CollectionManager* MetadataLoader::makeEmptyCollectionManager() {
    CollectionManager* manager = new CollectionManager;
    manager->_maxCollVersion = ChunkVersion(1, 0, OID());
    manager->_maxShardVersion = ChunkVersion(1, 0, OID());
    dassert(manager->isValid());
    return manager;
}
/** * Returns true if request is a query for sharded indexes. */ static bool doShardedIndexQuery(OperationContext* txn, Request& r, const QuerySpec& qSpec) { // Extract the ns field from the query, which may be embedded within the "query" or // "$query" field. auto nsField = qSpec.filter()["ns"]; if (nsField.eoo()) { return false; } const NamespaceString indexNSSQuery(nsField.str()); auto status = grid.catalogCache()->getDatabase(txn, indexNSSQuery.db().toString()); if (!status.isOK()) { return false; } shared_ptr<DBConfig> config = status.getValue(); if (!config->isSharded(indexNSSQuery.ns())) { return false; } // if you are querying on system.indexes, we need to make sure we go to a shard // that actually has chunks. This is not a perfect solution (what if you just // look at all indexes), but better than doing nothing. ShardPtr shard; ChunkManagerPtr cm; config->getChunkManagerOrPrimary(indexNSSQuery.ns(), cm, shard); if (cm) { set<ShardId> shardIds; cm->getAllShardIds(&shardIds); verify(shardIds.size() > 0); shard = grid.shardRegistry()->getShard(*shardIds.begin()); } ShardConnection dbcon(shard->getConnString(), r.getns()); DBClientBase& c = dbcon.conn(); string actualServer; Message response; bool ok = c.call(r.m(), response, true, &actualServer); uassert(10200, "mongos: error calling db", ok); { QueryResult::View qr = response.singleData().view2ptr(); if (qr.getResultFlags() & ResultFlag_ShardConfigStale) { dbcon.done(); // Version is zero b/c this is deprecated codepath throw RecvStaleConfigException(r.getns(), "Strategy::doQuery", ChunkVersion(0, 0, OID()), ChunkVersion(0, 0, OID())); } } r.reply(response, actualServer.size() ? actualServer : c.getServerAddress()); dbcon.done(); return true; }
/** * Returns true if request is a query for sharded indexes. */ static bool doShardedIndexQuery( Request& r, const QuerySpec& qSpec ) { // Extract the ns field from the query, which may be embedded within the "query" or // "$query" field. string indexNSQuery(qSpec.filter()["ns"].str()); DBConfigPtr config = grid.getDBConfig( r.getns() ); if ( !config->isSharded( indexNSQuery )) { return false; } // if you are querying on system.indexes, we need to make sure we go to a shard // that actually has chunks. This is not a perfect solution (what if you just // look at all indexes), but better than doing nothing. ShardPtr shard; ChunkManagerPtr cm; config->getChunkManagerOrPrimary( indexNSQuery, cm, shard ); if ( cm ) { set<Shard> shards; cm->getAllShards( shards ); verify( shards.size() > 0 ); shard.reset( new Shard( *shards.begin() ) ); } ShardConnection dbcon( *shard , r.getns() ); DBClientBase &c = dbcon.conn(); string actualServer; Message response; bool ok = c.call( r.m(), response, true , &actualServer ); uassert( 10200 , "mongos: error calling db", ok ); { QueryResult *qr = (QueryResult *) response.singleData(); if ( qr->resultFlags() & ResultFlag_ShardConfigStale ) { dbcon.done(); // Version is zero b/c this is deprecated codepath throw RecvStaleConfigException( r.getns(), "Strategy::doQuery", ChunkVersion( 0, 0, OID() ), ChunkVersion( 0, 0, OID() )); } } r.reply( response , actualServer.size() ? actualServer : c.getServerAddress() ); dbcon.done(); return true; }
ShardChunkManager* ShardChunkManager::cloneMinus( const BSONObj& min, const BSONObj& max, const ChunkVersion& version ) {
    // check that we have the exact chunk that will be subtracted
    _assertChunkExists( min , max );

    auto_ptr<ShardChunkManager> p( new ShardChunkManager );
    p->_key = this->_key;

    if ( _chunksMap.size() == 1 ) {
        // if left with no chunks, just reset version
        uassert( 13590 , str::stream() << "setting version to " << version.toString()
                                       << " on removing last chunk", ! version.isSet() );

        p->_version = ChunkVersion( 0, OID() );
        p->_collVersion = _collVersion;
    }
    else {
        // can't move version backwards when subtracting chunks
        // this is what guarantees that no read or write would be taken once we subtract data from the current shard
        if ( version <= _version ) {
            uasserted( 13585 , str::stream() << "version " << version.toString()
                                             << " not greater than " << _version.toString() );
        }

        p->_chunksMap = this->_chunksMap;
        p->_chunksMap.erase( min );
        p->_version = version;

        if( version > _collVersion ) p->_collVersion = version;
        else p->_collVersion = this->_collVersion;

        p->_fillRanges();
    }

    return p.release();
}
void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ChunkVersion version ) {
    scoped_lock lk( _mutex );

    CollectionMetadataMap::const_iterator it = _collMetadata.find( ns );
    verify( it != _collMetadata.end() ) ;
    CollectionMetadataPtr p = it->second;

    // empty shards should have version 0
    version = ( p->getNumChunks() > 1 ) ? version : ChunkVersion( 0, 0, p->getCollVersion().epoch() );

    ChunkType chunk;
    chunk.setMin( min );
    chunk.setMax( max );
    string errMsg;

    CollectionMetadataPtr cloned( p->cloneMigrate( chunk, version, &errMsg ) );
    // uassert to match old behavior, TODO: report errors w/o throwing
    uassert( 16855, errMsg, NULL != cloned.get() );

    // TODO: a bit dangerous to have two different zero-version states - no-metadata and
    // no-version
    _collMetadata[ns] = cloned;
}
const ChunkVersion ShardedConnectionInfo::getVersion(const std::string& ns) const {
    NSVersionMap::const_iterator it = _versions.find(ns);
    if (it != _versions.end()) {
        return it->second;
    } else {
        return ChunkVersion(0, 0, OID());
    }
}
ChunkManager::ChunkManager(const CollectionType& coll)
    : _ns(coll.getNs().ns()),
      _keyPattern(coll.getKeyPattern()),
      _unique(coll.getUnique()),
      _sequenceNumber(NextSequenceNumber.addAndFetch(1)),
      _chunkRanges() {
    // coll does not have correct version. Use same initial version as _load and createFirstChunks.
    _version = ChunkVersion(0, 0, coll.getEpoch());
}
ChunkVersion ChunkManager::getVersion(const std::string& shardName) const {
    ShardVersionMap::const_iterator i = _shardVersions.find(shardName);
    if (i == _shardVersions.end()) {
        // Shards without explicitly tracked shard versions (meaning they have
        // no chunks) always have a version of (0, 0, epoch). Note this is
        // *different* from the dropped chunk version of (0, 0, OID(000...)).
        // See s/chunk_version.h.
        return ChunkVersion(0, 0, _version.epoch());
    }
    return i->second;
}
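// --- Illustrative sketch (not from the original source) ---
// The comment in ChunkManager::getVersion above distinguishes two zero-version states:
// a shard that simply owns no chunks reports (0, 0, <collection epoch>), while a
// dropped/unsharded collection reports (0, 0, OID()), i.e. a null epoch. The helper
// below is hypothetical and only shows how a caller could tell the two apart using
// the ChunkVersion/OID accessors already used elsewhere in this code.
bool ownsNoChunksOfShardedCollection(const ChunkVersion& shardVersion,
                                     const OID& collectionEpoch) {
    return !shardVersion.isSet()                       // major/minor both zero
        && shardVersion.epoch().isSet()                // epoch is non-null...
        && shardVersion.epoch() == collectionEpoch;    // ...and matches the collection
}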
const ChunkVersion ShardingState::getVersion( const string& ns ) const {
    scoped_lock lk(_mutex);

    CollectionMetadataMap::const_iterator it = _collMetadata.find( ns );
    if ( it != _collMetadata.end() ) {
        CollectionMetadataPtr p = it->second;
        return p->getShardVersion();
    }
    else {
        return ChunkVersion( 0, OID() );
    }
}
void ChunkManager::createFirstChunks(OperationContext* txn,
                                     const ShardId& primaryShardId,
                                     const vector<BSONObj>* initPoints,
                                     const set<ShardId>* initShardIds) {
    // TODO distlock?
    // TODO: Race condition if we shard the collection and insert data while we split across
    // the non-primary shard.

    vector<BSONObj> splitPoints;
    vector<ShardId> shardIds;
    calcInitSplitsAndShards(txn, primaryShardId, initPoints, initShardIds, &splitPoints, &shardIds);

    // this is the first chunk; start the versioning from scratch
    ChunkVersion version;
    version.incEpoch();
    version.incMajor();

    log() << "going to create " << splitPoints.size() + 1 << " chunk(s) for: " << _ns
          << " using new epoch " << version.epoch();

    for (unsigned i = 0; i <= splitPoints.size(); i++) {
        BSONObj min = i == 0 ? _keyPattern.getKeyPattern().globalMin() : splitPoints[i - 1];
        BSONObj max =
            i < splitPoints.size() ? splitPoints[i] : _keyPattern.getKeyPattern().globalMax();

        Chunk temp(this, min, max, shardIds[i % shardIds.size()], version);

        BSONObjBuilder chunkBuilder;
        temp.serialize(chunkBuilder);

        BSONObj chunkObj = chunkBuilder.obj();

        Status result = grid.catalogManager(txn)->update(txn,
                                                         ChunkType::ConfigNS,
                                                         BSON(ChunkType::name(temp.genID())),
                                                         chunkObj,
                                                         true,
                                                         false,
                                                         NULL);

        version.incMinor();

        if (!result.isOK()) {
            string ss = str::stream()
                << "creating first chunks failed. result: " << result.reason();
            error() << ss;
            msgasserted(15903, ss);
        }
    }

    _version = ChunkVersion(0, 0, version.epoch());
}
void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ChunkVersion version ) {
    scoped_lock lk( _mutex );

    ChunkManagersMap::const_iterator it = _chunks.find( ns );
    verify( it != _chunks.end() ) ;
    ShardChunkManagerPtr p = it->second;

    // empty shards should have version 0
    version = ( p->getNumChunks() > 1 ) ? version : ChunkVersion( 0 , OID() );

    ShardChunkManagerPtr cloned( p->cloneMinus( min , max , version ) );

    // TODO: a bit dangerous to have two different zero-version states - no-manager and
    // no-version
    _chunks[ns] = cloned;
}
Status ChunkManager::createFirstChunks(OperationContext* txn,
                                       const ShardId& primaryShardId,
                                       const vector<BSONObj>* initPoints,
                                       const set<ShardId>* initShardIds) {
    // TODO distlock?
    // TODO: Race condition if we shard the collection and insert data while we split across
    // the non-primary shard.

    vector<BSONObj> splitPoints;
    vector<ShardId> shardIds;
    calcInitSplitsAndShards(txn, primaryShardId, initPoints, initShardIds, &splitPoints, &shardIds);

    // this is the first chunk; start the versioning from scratch
    ChunkVersion version(1, 0, OID::gen());

    log() << "going to create " << splitPoints.size() + 1 << " chunk(s) for: " << _ns
          << " using new epoch " << version.epoch();

    for (unsigned i = 0; i <= splitPoints.size(); i++) {
        BSONObj min = i == 0 ? _keyPattern.getKeyPattern().globalMin() : splitPoints[i - 1];
        BSONObj max =
            i < splitPoints.size() ? splitPoints[i] : _keyPattern.getKeyPattern().globalMax();

        ChunkType chunk;
        chunk.setName(Chunk::genID(_ns, min));
        chunk.setNS(_ns);
        chunk.setMin(min);
        chunk.setMax(max);
        chunk.setShard(shardIds[i % shardIds.size()]);
        chunk.setVersion(version);

        Status status = grid.catalogManager(txn)
                            ->insertConfigDocument(txn, ChunkType::ConfigNS, chunk.toBSON());
        if (!status.isOK()) {
            const string errMsg = str::stream() << "Creating first chunks failed: "
                                                << status.reason();
            error() << errMsg;
            return Status(status.code(), errMsg);
        }

        version.incMinor();
    }

    _version = ChunkVersion(0, 0, version.epoch());

    return Status::OK();
}
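// --- Illustrative sketch (not from the original source) ---
// Shows how the initial chunk versions produced by the createFirstChunks variants
// above come out for two split points (three chunks), assuming incMinor() bumps only
// the minor component. Purely an illustration; the function name is hypothetical.
void illustrateFirstChunkVersions() {
    ChunkVersion version(1, 0, OID::gen());
    // chunk [MinKey, split0) is written with version 1|0||epoch
    version.incMinor();
    // chunk [split0, split1) is written with version 1|1||epoch
    version.incMinor();
    // chunk [split1, MaxKey) is written with version 1|2||epoch

    // The manager's own _version is then left at (0, 0, epoch), so the next metadata
    // load pulls all of these freshly written chunks back from the config servers.
    ChunkVersion managerVersion(0, 0, version.epoch());
}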
Status MetadataLoader::initCollection( const string& ns, const string& shard, CollectionMetadata* metadata ) { // // Bring collection entry from the config server. // BSONObj collObj; { try { ScopedDbConnection conn( _configLoc.toString(), 30 ); collObj = conn->findOne( CollectionType::ConfigNS, QUERY(CollectionType::ns()<<ns)); conn.done(); } catch ( const DBException& e ) { string errMsg = str::stream() << "could not query collection metadata" << causedBy( e ); // We deliberately do not return conn to the pool, since it was involved // with the error here. return Status( ErrorCodes::HostUnreachable, errMsg ); } } CollectionType collDoc; string errMsg; if ( !collDoc.parseBSON( collObj, &errMsg ) || !collDoc.isValid( &errMsg ) ) { return Status( ErrorCodes::FailedToParse, errMsg ); } // // Load or generate default chunks for collection config. // if ( collDoc.isKeyPatternSet() && !collDoc.getKeyPattern().isEmpty() ) { metadata->_keyPattern = collDoc.getKeyPattern(); metadata->_shardVersion = ChunkVersion( 0, 0, collDoc.getEpoch() ); metadata->_collVersion = ChunkVersion( 0, 0, collDoc.getEpoch() ); return Status::OK(); } else if ( collDoc.isPrimarySet() && collDoc.getPrimary() == shard ) { if ( shard == "" ) { warning() << "shard not verified, assuming collection " << ns << " is unsharded on this shard" << endl; } metadata->_keyPattern = BSONObj(); metadata->_shardVersion = ChunkVersion( 1, 0, collDoc.getEpoch() ); metadata->_collVersion = metadata->_shardVersion; return Status::OK(); } else { errMsg = str::stream() << "collection " << ns << " does not have a shard key " << "and primary " << ( collDoc.isPrimarySet() ? collDoc.getPrimary() : "" ) << " does not match this shard " << shard; return Status( ErrorCodes::RemoteChangeDetected, errMsg ); } }
Status MetadataLoader::initChunks( const string& ns, const string& shard, const CollectionMetadata* oldMetadata, CollectionMetadata* metadata ) { map<string, ChunkVersion> versionMap; OID epoch = metadata->getCollVersion().epoch(); // Check to see if we should use the old version or not. if ( oldMetadata ) { ChunkVersion oldVersion = oldMetadata->getShardVersion(); if ( oldVersion.isSet() && oldVersion.hasCompatibleEpoch( epoch ) ) { // Our epoch for coll version and shard version should be the same. verify( oldMetadata->getCollVersion().hasCompatibleEpoch( epoch ) ); versionMap[shard] = oldMetadata->_shardVersion; metadata->_collVersion = oldMetadata->_collVersion; // TODO: This could be made more efficient if copying not required, but // not as frequently reloaded as in mongos. metadata->_chunksMap = oldMetadata->_chunksMap; LOG(2) << "loading new chunks for collection " << ns << " using old metadata w/ version " << oldMetadata->getShardVersion() << " and " << metadata->_chunksMap.size() << " chunks" << endl; } } // Exposes the new metadata's range map and version to the "differ," who // would ultimately be responsible of filling them up. SCMConfigDiffTracker differ( shard ); differ.attach( ns, metadata->_chunksMap, metadata->_collVersion, versionMap ); try { ScopedDbConnection conn( _configLoc.toString(), 30 ); auto_ptr<DBClientCursor> cursor = conn->query( ChunkType::ConfigNS, differ.configDiffQuery() ); if ( !cursor.get() ) { metadata->_collVersion = ChunkVersion(); metadata->_chunksMap.clear(); conn.done(); return Status( ErrorCodes::HostUnreachable, "problem opening chunk metadata cursor" ); } // Diff tracker should *always* find at least one chunk if this shard owns a chunk. int diffsApplied = differ.calculateConfigDiff( *cursor ); if ( diffsApplied > 0 ) { LOG(2) << "loaded " << diffsApplied << " chunks into new metadata for " << ns << " with version " << metadata->_collVersion << endl; metadata->_shardVersion = versionMap[shard]; metadata->fillRanges(); conn.done(); return Status::OK(); } else if ( diffsApplied == 0 ) { warning() << "no chunks found when reloading " << ns << ", previous version was " << metadata->_collVersion.toString() << endl; metadata->_collVersion = ChunkVersion( 0, 0, OID() ); metadata->_chunksMap.clear(); conn.done(); return Status::OK(); } else { // TODO: make this impossible by making sure we don't migrate / split on this // shard during the reload. No chunks were found for the ns. string errMsg = str::stream() << "invalid chunks found when reloading " << ns << ", previous version was " << metadata->_collVersion.toString() << ", this should be rare"; warning() << errMsg << endl; metadata->_collVersion = ChunkVersion( 0, 0, OID() ); metadata->_chunksMap.clear(); conn.done(); return Status( ErrorCodes::RemoteChangeDetected, errMsg ); } } catch ( const DBException& e ) { string errMsg = str::stream() << "problem querying chunks metadata" << causedBy( e ); // We deliberately do not return connPtr to the pool, since it was involved // with the error here. return Status( ErrorCodes::HostUnreachable, errMsg ); } }
bool MetadataLoader::initCollection(const string& ns, const string& shard, const CollectionManager* oldManager, CollectionManager* manager, string* errMsg) { // // Bring collection entry from the config server. // BSONObj collObj; { scoped_ptr<ScopedDbConnection> connPtr; try { connPtr.reset( ScopedDbConnection::getInternalScopedDbConnection(_configLoc.toString(), 30)); ScopedDbConnection& conn = *connPtr; collObj = conn->findOne(CollectionType::ConfigNS, QUERY(CollectionType::ns()<<ns)); } catch (const DBException& e) { *errMsg = str::stream() << "caught exception accessing the config servers " << causedBy(e); // We deliberately do not return connPtr to the pool, since it was involved // with the error here. return false; } connPtr->done(); } CollectionType collDoc; if (!collDoc.parseBSON(collObj, errMsg) || !collDoc.isValid(errMsg)) { return false; } // // Load or generate default chunks for collection config. // if (!collDoc.getKeyPattern().isEmpty()) { manager->_key = collDoc.getKeyPattern(); if(!initChunks(collDoc, ns, shard, oldManager, manager, errMsg)){ return false; } } else if(collDoc.getPrimary() == shard) { if (shard == "") { warning() << "shard not verified, assuming collection " << ns << " is unsharded on this shard" << endl; } manager->_key = BSONObj(); manager->_maxShardVersion = ChunkVersion(1, 0, collDoc.getEpoch()); manager->_maxCollVersion = manager->_maxShardVersion; } else { *errMsg = str::stream() << "collection " << ns << " does not have a shard key " << "and primary " << collDoc.getPrimary() << " does not match this shard " << shard; return false; } return true; }
Status MetadataLoader::initCollection( const string& ns, const string& shard, CollectionMetadata* metadata ) const { // // Bring collection entry from the config server. // BSONObj collDoc; { try { ScopedDbConnection conn( _configLoc.toString(), 30 ); collDoc = conn->findOne( CollectionType::ConfigNS, QUERY(CollectionType::ns()<<ns)); conn.done(); } catch ( const DBException& e ) { string errMsg = str::stream() << "could not query collection metadata" << causedBy( e ); // We deliberately do not return conn to the pool, since it was involved // with the error here. return Status( ErrorCodes::HostUnreachable, errMsg ); } } string errMsg; if ( collDoc.isEmpty() ) { errMsg = str::stream() << "could not load metadata, collection " << ns << " not found"; warning() << errMsg << endl; return Status( ErrorCodes::NamespaceNotFound, errMsg ); } CollectionType collInfo; if ( !collInfo.parseBSON( collDoc, &errMsg ) || !collInfo.isValid( &errMsg ) ) { errMsg = str::stream() << "could not parse metadata for collection " << ns << causedBy( errMsg ); warning() << errMsg << endl; return Status( ErrorCodes::FailedToParse, errMsg ); } if ( collInfo.isDroppedSet() && collInfo.getDropped() ) { errMsg = str::stream() << "could not load metadata, collection " << ns << " was dropped"; warning() << errMsg << endl; return Status( ErrorCodes::NamespaceNotFound, errMsg ); } if ( collInfo.isKeyPatternSet() && !collInfo.getKeyPattern().isEmpty() ) { // Sharded collection, need to load chunks metadata->_keyPattern = collInfo.getKeyPattern(); metadata->_shardVersion = ChunkVersion( 0, 0, collInfo.getEpoch() ); metadata->_collVersion = ChunkVersion( 0, 0, collInfo.getEpoch() ); return Status::OK(); } else if ( collInfo.isPrimarySet() && collInfo.getPrimary() == shard ) { // A collection with a non-default primary // Empty primary field not allowed if set dassert( collInfo.getPrimary() != "" ); metadata->_keyPattern = BSONObj(); metadata->_shardVersion = ChunkVersion( 1, 0, collInfo.getEpoch() ); metadata->_collVersion = metadata->_shardVersion; return Status::OK(); } else { // A collection with a primary that doesn't match this shard or is empty, the primary // may have changed before we loaded. errMsg = // br str::stream() << "collection " << ns << " does not have a shard key " << "and primary " << ( collInfo.isPrimarySet() ? collInfo.getPrimary() : "" ) << " does not match this shard " << shard; warning() << errMsg << endl; metadata->_collVersion = ChunkVersion( 0, 0, OID() ); return Status( ErrorCodes::RemoteChangeDetected, errMsg ); } }
bool MetadataLoader::initChunks(const CollectionType& collDoc,
                                const string& ns,
                                const string& shard,
                                const CollectionManager* oldManager,
                                CollectionManager* manager,
                                string* errMsg) {
    map<string,ChunkVersion> versionMap;
    manager->_maxCollVersion = ChunkVersion(0, 0, collDoc.getEpoch());

    // Check to see if we should use the old version or not.
    if (oldManager) {
        ChunkVersion oldVersion = oldManager->getMaxShardVersion();

        if (oldVersion.isSet() && oldVersion.hasCompatibleEpoch(collDoc.getEpoch())) {

            // Our epoch for coll version and shard version should be the same.
            verify(oldManager->getMaxCollVersion().hasCompatibleEpoch(collDoc.getEpoch()));

            versionMap[shard] = oldManager->_maxShardVersion;
            manager->_maxCollVersion = oldManager->_maxCollVersion;

            // TODO: This could be made more efficient if copying not required, but
            // not as frequently reloaded as in mongos.
            manager->_chunksMap = oldManager->_chunksMap;

            LOG(2) << "loading new chunks for collection " << ns
                   << " using old chunk manager w/ version "
                   << oldManager->getMaxShardVersion()
                   << " and " << manager->_chunksMap.size() << " chunks" << endl;
        }
    }

    // Exposes the new 'manager's range map and version to the "differ," who
    // would ultimately be responsible of filling them up.
    SCMConfigDiffTracker differ(shard);
    differ.attach(ns, manager->_chunksMap, manager->_maxCollVersion, versionMap);

    try {
        scoped_ptr<ScopedDbConnection> connPtr(
            ScopedDbConnection::getInternalScopedDbConnection(_configLoc.toString(), 30));
        ScopedDbConnection& conn = *connPtr;

        auto_ptr<DBClientCursor> cursor = conn->query(ChunkType::ConfigNS,
                                                      differ.configDiffQuery());

        if (!cursor.get()) {
            // 'errMsg' was filled by the getChunkCursor() call.
            manager->_maxCollVersion = ChunkVersion();
            manager->_chunksMap.clear();
            connPtr->done();
            return false;
        }

        // Diff tracker should *always* find at least one chunk if collection exists.
        int diffsApplied = differ.calculateConfigDiff(*cursor);
        if (diffsApplied > 0) {
            LOG(2) << "loaded " << diffsApplied
                   << " chunks into new chunk manager for " << ns
                   << " with version " << manager->_maxCollVersion << endl;

            manager->_maxShardVersion = versionMap[shard];
            manager->fillRanges();
            connPtr->done();
            return true;
        }
        else if (diffsApplied == 0) {
            *errMsg = str::stream() << "no chunks found when reloading " << ns
                                    << ", previous version was "
                                    << manager->_maxCollVersion.toString();
            warning() << *errMsg << endl;

            manager->_maxCollVersion = ChunkVersion();
            manager->_chunksMap.clear();
            connPtr->done();
            return false;
        }
        else {
            // TODO: make this impossible by making sure we don't migrate / split on this
            // shard during the reload. No chunks were found for the ns.
            *errMsg = str::stream() << "invalid chunks found when reloading " << ns
                                    << ", previous version was "
                                    << manager->_maxCollVersion.toString()
                                    << ", this should be rare";
            warning() << *errMsg << endl;

            manager->_maxCollVersion = ChunkVersion();
            manager->_chunksMap.clear();
            connPtr->done();
            return false;
        }
    }
    catch (const DBException& e) {
        *errMsg = str::stream() << "caught exception accessing the config servers"
                                << causedBy(e);

        // We deliberately do not return connPtr to the pool, since it was involved
        // with the error here.
        return false;
    }
}
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { // Steps // 1. check basic config // 2. extract params from command // 3. fast check // 4. slow check (LOCKS) // step 1 lastError.disableForCommand(); ShardedConnectionInfo* info = ShardedConnectionInfo::get( true ); // make sure we have the mongos id for writebacks if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) ) return false; bool authoritative = cmdObj.getBoolField( "authoritative" ); // check config server is ok or enable sharding if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) ) return false; // check shard name/hosts are correct if ( cmdObj["shard"].type() == String ) { shardingState.gotShardName( cmdObj["shard"].String() ); shardingState.gotShardHost( cmdObj["shardHost"].String() ); } // Handle initial shard connection if( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ){ result.append( "initialized", true ); return true; } // we can run on a slave up to here if ( ! isMaster( "admin" ) ) { result.append( "errmsg" , "not master" ); result.append( "note" , "from post init in setShardVersion" ); return false; } // step 2 string ns = cmdObj["setShardVersion"].valuestrsafe(); if ( ns.size() == 0 ) { errmsg = "need to specify namespace"; return false; } if( ! ConfigVersion::canParseBSON( cmdObj, "version" ) ){ errmsg = "need to specify version"; return false; } const ConfigVersion version = ConfigVersion::fromBSON( cmdObj, "version" ); // step 3 const ConfigVersion oldVersion = info->getVersion(ns); const ConfigVersion globalVersion = shardingState.getVersion(ns); oldVersion.addToBSON( result, "oldVersion" ); if ( globalVersion.isSet() && version.isSet() ) { // this means there is no reset going on an either side // so its safe to make some assumptions if ( version.isWriteCompatibleWith( globalVersion ) ) { // mongos and mongod agree! if ( ! oldVersion.isWriteCompatibleWith( version ) ) { if ( oldVersion < globalVersion && oldVersion.hasCompatibleEpoch(globalVersion) ) { info->setVersion( ns , version ); } else if ( authoritative ) { // this means there was a drop and our version is reset info->setVersion( ns , version ); } else { result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "verifying drop on '" + ns + "'"; return false; } } return true; } } // step 4 // this is because of a weird segfault I saw and I can't see why this should ever be set massert( 13647 , str::stream() << "context should be empty here, is: " << cc().getContext()->ns() , cc().getContext() == 0 ); Lock::GlobalWrite setShardVersionLock; // TODO: can we get rid of this?? if ( oldVersion.isSet() && ! globalVersion.isSet() ) { // this had been reset info->setVersion( ns , ChunkVersion( 0, OID() ) ); } if ( ! version.isSet() && ! globalVersion.isSet() ) { // this connection is cleaning itself info->setVersion( ns , ChunkVersion( 0, OID() ) ); return true; } if ( ! version.isSet() && globalVersion.isSet() ) { if ( ! 
authoritative ) { result.appendBool( "need_authoritative" , true ); result.append( "ns" , ns ); globalVersion.addToBSON( result, "globalVersion" ); errmsg = "dropping needs to be authoritative"; return false; } log() << "wiping data for: " << ns << endl; globalVersion.addToBSON( result, "beforeDrop" ); // only setting global version on purpose // need clients to re-find meta-data shardingState.resetVersion( ns ); info->setVersion( ns , ChunkVersion( 0, OID() ) ); return true; } // TODO: Refactor all of this if ( version < oldVersion && version.hasCompatibleEpoch( oldVersion ) ) { errmsg = "this connection already had a newer version of collection '" + ns + "'"; result.append( "ns" , ns ); version.addToBSON( result, "newVersion" ); globalVersion.addToBSON( result, "globalVersion" ); return false; } // TODO: Refactor all of this if ( version < globalVersion && version.hasCompatibleEpoch( globalVersion ) ) { while ( shardingState.inCriticalMigrateSection() ) { log() << "waiting till out of critical section" << endl; dbtemprelease r; shardingState.waitTillNotInCriticalSection( 10 ); } errmsg = "shard global version for collection is higher than trying to set to '" + ns + "'"; result.append( "ns" , ns ); version.addToBSON( result, "version" ); globalVersion.addToBSON( result, "globalVersion" ); result.appendBool( "reloadConfig" , true ); return false; } if ( ! globalVersion.isSet() && ! authoritative ) { // Needed b/c when the last chunk is moved off a shard, the version gets reset to zero, which // should require a reload. while ( shardingState.inCriticalMigrateSection() ) { log() << "waiting till out of critical section" << endl; dbtemprelease r; shardingState.waitTillNotInCriticalSection( 10 ); } // need authoritative for first look result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "first time for collection '" + ns + "'"; return false; } Timer relockTime; { dbtemprelease unlock; ChunkVersion currVersion = version; if ( ! shardingState.trySetVersion( ns , currVersion ) ) { errmsg = str::stream() << "client version differs from config's for collection '" << ns << "'"; result.append( "ns" , ns ); // If this was a reset of a collection, inform mongos to do a full reload if( ! currVersion.isSet() ){ ConfigVersion( 0, OID() ).addToBSON( result, "version" ); result.appendBool( "reloadConfig", true ); } else{ version.addToBSON( result, "version" ); } globalVersion.addToBSON( result, "globalVersion" ); return false; } } if ( relockTime.millis() >= ( cmdLine.slowMS - 10 ) ) { log() << "setShardVersion - relocking slow: " << relockTime.millis() << endl; } info->setVersion( ns , version ); return true; }
/** * Updates the remote cached version on the remote shard host (primary, in the case of replica * sets) if needed with a fully-qualified shard version for the given namespace: * config server(s) + shard name + shard version * * If no remote cached version has ever been set, an initial shard version is sent. * * If the namespace is empty and no version has ever been sent, the config server + shard name * is sent to the remote shard host to initialize the connection as coming from mongos. * NOTE: This initialization is *best-effort only*. Operations which wish to correctly version * must send the namespace. * * Config servers are special and are not (unless otherwise a shard) kept up to date with this * protocol. This is safe so long as config servers only contain unversioned collections. * * It is an error to call checkShardVersion with an unversionable connection (isVersionableCB). * * @return true if we contacted the remote host */ bool checkShardVersion(DBClientBase* conn_in, const string& ns, ChunkManagerPtr refManager, bool authoritative, int tryNumber) { // TODO: cache, optimize, etc... // Empty namespaces are special - we require initialization but not versioning if (ns.size() == 0) { return initShardVersionEmptyNS(conn_in); } auto status = grid.catalogCache()->getDatabase(nsToDatabase(ns)); if (!status.isOK()) { return false; } shared_ptr<DBConfig> conf = status.getValue(); DBClientBase* conn = getVersionable(conn_in); verify(conn); // errors thrown above unsigned long long officialSequenceNumber = 0; ShardPtr primary; ChunkManagerPtr manager; if (authoritative) conf->getChunkManagerIfExists(ns, true); conf->getChunkManagerOrPrimary(ns, manager, primary); if (manager) { officialSequenceNumber = manager->getSequenceNumber(); } const auto shard = grid.shardRegistry()->getShard(conn->getServerAddress()); uassert(ErrorCodes::ShardNotFound, str::stream() << conn->getServerAddress() << " is not recognized as a shard", shard); // Check this manager against the reference manager if (manager) { if (refManager && !refManager->compatibleWith(*manager, shard->getId())) { const ChunkVersion refVersion(refManager->getVersion(shard->getId())); const ChunkVersion currentVersion(manager->getVersion(shard->getId())); string msg(str::stream() << "manager (" << currentVersion.toString() << " : " << manager->getSequenceNumber() << ") " << "not compatible with reference manager (" << refVersion.toString() << " : " << refManager->getSequenceNumber() << ") " << "on shard " << shard->getId() << " (" << shard->getConnString().toString() << ")"); throw SendStaleConfigException(ns, msg, refVersion, currentVersion); } } else if (refManager) { string msg(str::stream() << "not sharded (" << ((manager.get() == 0) ? string("<none>") : str::stream() << manager->getSequenceNumber()) << ") but has reference manager (" << refManager->getSequenceNumber() << ") " << "on conn " << conn->getServerAddress() << " (" << conn_in->getServerAddress() << ")"); throw SendStaleConfigException( ns, msg, refManager->getVersion(shard->getId()), ChunkVersion::UNSHARDED()); } // Do not send setShardVersion to collections on the config servers - this causes problems // when config servers are also shards and get SSV with conflicting names. // TODO: Make config servers regular shards if (primary && primary->getId() == "config") { return false; } // Has the ChunkManager been reloaded since the last time we updated the shard version over // this connection? If we've never updated the shard version, do so now. 
unsigned long long sequenceNumber = 0; if (connectionShardStatus.getSequence(conn, ns, &sequenceNumber)) { if (sequenceNumber == officialSequenceNumber) { return false; } } ChunkVersion version = ChunkVersion(0, 0, OID()); if (manager) { version = manager->getVersion(shard->getId()); } LOG(1) << "setting shard version of " << version << " for " << ns << " on shard " << shard->toString(); LOG(3) << "last version sent with chunk manager iteration " << sequenceNumber << ", current chunk manager iteration is " << officialSequenceNumber; BSONObj result; if (setShardVersion(*conn, ns, grid.catalogManager()->connectionString().toString(), version, manager.get(), authoritative, result)) { LOG(1) << " setShardVersion success: " << result; connectionShardStatus.setSequence(conn, ns, officialSequenceNumber); return true; } LOG(1) << " setShardVersion failed!\n" << result << endl; if (result["need_authoritative"].trueValue()) massert(10428, "need_authoritative set but in authoritative mode already", !authoritative); if (!authoritative) { // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1); return true; } if (result["reloadConfig"].trueValue()) { if (result["version"].timestampTime() == Date_t()) { warning() << "reloading full configuration for " << conf->name() << ", connection state indicates significant version changes"; // reload db conf->reload(); } else { // reload config conf->getChunkManager(ns, true); } } const int maxNumTries = 7; if (tryNumber < maxNumTries) { LOG(tryNumber < (maxNumTries / 2) ? 1 : 0) << "going to retry checkShardVersion shard: " << shard->toString() << " " << result; sleepmillis(10 * tryNumber); // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1); return true; } string errmsg = str::stream() << "setShardVersion failed shard: " << shard->toString() << " " << result; log() << " " << errmsg << endl; massert(10429, errmsg, 0); return true; }
Query ConfigDiffTracker<ValType,ShardType>::
configDiffQuery( const set<ChunkVersion>& extraMinorVersions ) const {
    verifyAttached();

    //
    // Basic idea behind the query is to find all the chunks $gt the current max version, and
    // then also update chunks that we need minor versions - splits and (2.0) max chunks on
    // shards
    //

    static const int maxMinorVersionClauses = 50;
    BSONObjBuilder queryB;

    int numStaleMinorClauses = extraMinorVersions.size() + _maxShardVersions->size();

#ifdef _DEBUG
    // In debug builds, randomly trigger full reloads to exercise both codepaths
    if( rand() % 2 ) numStaleMinorClauses = maxMinorVersionClauses;
#endif

    queryB.append(ChunkType::ns(), _ns);

    //
    // If we have only a few minor versions to refresh, we can be more selective in our query
    //
    if( numStaleMinorClauses < maxMinorVersionClauses ) {

        //
        // Get any version changes higher than we know currently
        //
        BSONArrayBuilder queryOrB( queryB.subarrayStart( "$or" ) );
        {
            BSONObjBuilder queryNewB( queryOrB.subobjStart() );
            {
                BSONObjBuilder ts(queryNewB.subobjStart(ChunkType::DEPRECATED_lastmod()));
                // We should *always* pull at least a single chunk back, this lets us quickly
                // detect if our collection was unsharded (and most of the time if it was
                // resharded) in the meantime
                ts.appendTimestamp( "$gte", _maxVersion->toLong() );
                ts.done();
            }

            queryNewB.done();
        }

        // Get any shard version changes higher than we know currently
        // Needed since there could have been a split of the max version chunk of any shard
        // TODO: Ideally, we shouldn't care about these
        for( typename map<ShardType, ChunkVersion>::const_iterator it = _maxShardVersions->begin();
             it != _maxShardVersions->end(); it++ )
        {
            BSONObjBuilder queryShardB( queryOrB.subobjStart() );
            queryShardB.append(ChunkType::shard(), nameFrom( it->first ) );
            {
                BSONObjBuilder ts(queryShardB.subobjStart(ChunkType::DEPRECATED_lastmod()));
                ts.appendTimestamp( "$gt", it->second.toLong() );
                ts.done();
            }
            queryShardB.done();
        }

        // Get any minor version changes we've marked as interesting
        // TODO: Ideally we shouldn't care about these
        for( set<ChunkVersion>::const_iterator it = extraMinorVersions.begin();
             it != extraMinorVersions.end(); it++ )
        {
            BSONObjBuilder queryShardB( queryOrB.subobjStart() );
            {
                BSONObjBuilder ts(queryShardB.subobjStart(ChunkType::DEPRECATED_lastmod()));
                ts.appendTimestamp( "$gt", it->toLong() );
                ts.appendTimestamp( "$lt",
                                    ChunkVersion( it->majorVersion() + 1, 0, OID() ).toLong() );
                ts.done();
            }
            queryShardB.done();
        }

        queryOrB.done();
    }

    BSONObj query = queryB.obj();

    LOG(2) << "major version query from " << *_maxVersion << " and over "
           << _maxShardVersions->size() << " shards is " << query << endl;

    //
    // NOTE: IT IS IMPORTANT FOR CONSISTENCY THAT WE SORT BY ASC VERSION, TO HANDLE
    // CURSOR YIELDING BETWEEN CHUNKS BEING MIGRATED.
    //
    // This ensures that changes to chunk version (which will always be higher) will always
    // come *after* our current position in the chunk cursor.
    //
    Query queryObj(query);
    queryObj.sort(BSON( "lastmod" << 1 ));

    // Return the sorted query; returning Query( query ) here would silently drop the
    // ascending-lastmod sort that the NOTE above depends on.
    return queryObj;
}
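// --- Illustrative sketch (not from the original source) ---
// The essential shape of the diff query built by configDiffQuery above, reduced to the
// single "everything at or above the version we already know" clause and omitting the
// per-shard / extra-minor-version $or clauses. The namespace value, the function name,
// and maxVersionSeen are assumptions made for the sake of the example.
Query exampleConfigDiffQuery() {
    const string exampleNS("test.foo");               // assumed namespace
    ChunkVersion maxVersionSeen(1, 0, OID::gen());    // assumed last-seen collection version

    BSONObjBuilder queryB;
    queryB.append(ChunkType::ns(), exampleNS);
    {
        BSONObjBuilder ts(queryB.subobjStart(ChunkType::DEPRECATED_lastmod()));
        ts.appendTimestamp("$gte", maxVersionSeen.toLong());
        ts.done();
    }

    Query diffQuery(queryB.obj());
    diffQuery.sort(BSON("lastmod" << 1));  // ascending version order, as the NOTE above requires
    return diffQuery;
}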
void ShardChunkManager::_init( const string& configServer , const string& ns , const string& shardName, ShardChunkManagerPtr oldManager ) { // have to get a connection to the config db // special case if I'm the configdb since I'm locked and if I connect to myself // its a deadlock scoped_ptr<ScopedDbConnection> scoped; scoped_ptr<DBDirectClient> direct; DBClientBase * conn; if ( configServer.empty() ) { direct.reset( new DBDirectClient() ); conn = direct.get(); } else { scoped.reset( ScopedDbConnection::getInternalScopedDbConnection( configServer, 30.0 ) ); conn = scoped->get(); } // get this collection's sharding key BSONObj collectionDoc = conn->findOne(CollectionType::ConfigNS, BSON(CollectionType::ns(ns))); if( collectionDoc.isEmpty() ){ warning() << ns << " does not exist as a sharded collection" << endl; return; } if( collectionDoc[CollectionType::dropped()].Bool() ){ warning() << ns << " was dropped. Re-shard collection first." << endl; return; } _fillCollectionKey( collectionDoc ); map<string,ChunkVersion> versionMap; versionMap[ shardName ] = _version; _collVersion = ChunkVersion( 0, OID() ); // Check to see if we have an old ShardChunkManager to use if( oldManager && oldManager->_collVersion.isSet() ){ versionMap[ shardName ] = oldManager->_version; _collVersion = oldManager->_collVersion; // TODO: This could be made more efficient if copying not required, but not as // frequently reloaded as in mongos. _chunksMap = oldManager->_chunksMap; LOG(2) << "loading new chunks for collection " << ns << " using old chunk manager w/ version " << _collVersion << " and " << _chunksMap.size() << " chunks" << endl; } // Attach our config diff tracker to our range map and versions SCMConfigDiffTracker differ( shardName ); differ.attach( ns, _chunksMap, _collVersion, versionMap ); // Need to do the query ourselves, since we may use direct conns to the db Query query = differ.configDiffQuery(); auto_ptr<DBClientCursor> cursor = conn->query(ChunkType::ConfigNS, query); uassert( 16181, str::stream() << "could not initialize cursor to config server chunks collection for ns " << ns, cursor.get() ); // Diff tracker should *always* find at least one chunk if collection exists int diffsApplied = differ.calculateConfigDiff( *cursor ); if( diffsApplied > 0 ){ LOG(2) << "loaded " << diffsApplied << " chunks into new chunk manager for " << ns << " with version " << _collVersion << endl; // Save the new version of this shard _version = versionMap[ shardName ]; _fillRanges(); } else if( diffsApplied == 0 ){ // No chunks were found for the ns warning() << "no chunks found when reloading " << ns << ", previous version was " << _collVersion << endl; _version = ChunkVersion( 0, OID() ); _collVersion = ChunkVersion( 0, OID() ); _chunksMap.clear(); } else{ // TODO: make this impossible by making sure we don't migrate / split on this shard during the // reload // No chunks were found for the ns warning() << "invalid chunks found when reloading " << ns << ", previous version was " << _collVersion << ", this should be rare" << endl; // Handle the same way as a connectivity error, for now // TODO: handle inline uassert( 16229, str::stream() << "could not initialize cursor to config server chunks collection for ns " << ns, cursor.get() ); } if ( scoped.get() ) scoped->done(); if ( _chunksMap.empty() ) log() << "no chunk for collection " << ns << " on shard " << shardName << endl; }
/** * @return true if had to do something */ bool checkShardVersion( DBClientBase * conn_in , const string& ns , ChunkManagerPtr refManager, bool authoritative , int tryNumber ) { // TODO: cache, optimize, etc... WriteBackListener::init( *conn_in ); DBConfigPtr conf = grid.getDBConfig( ns ); if ( ! conf ) return false; DBClientBase* conn = getVersionable( conn_in ); verify(conn); // errors thrown above unsigned long long officialSequenceNumber = 0; ChunkManagerPtr manager; const bool isSharded = conf->isSharded( ns ); if ( isSharded ) { manager = conf->getChunkManagerIfExists( ns , authoritative ); // It's possible the chunk manager was reset since we checked whether sharded was true, // so must check this here. if( manager ) officialSequenceNumber = manager->getSequenceNumber(); } // Check this manager against the reference manager if( isSharded && manager ){ Shard shard = Shard::make( conn->getServerAddress() ); if( refManager && ! refManager->compatibleWith( manager, shard ) ){ throw SendStaleConfigException( ns, str::stream() << "manager (" << manager->getVersion( shard ).toString() << " : " << manager->getSequenceNumber() << ") " << "not compatible with reference manager (" << refManager->getVersion( shard ).toString() << " : " << refManager->getSequenceNumber() << ") " << "on shard " << shard.getName() << " (" << shard.getAddress().toString() << ")", refManager->getVersion( shard ), manager->getVersion( shard ) ); } } else if( refManager ){ Shard shard = Shard::make( conn->getServerAddress() ); string msg( str::stream() << "not sharded (" << ( (manager.get() == 0) ? string( "<none>" ) : str::stream() << manager->getSequenceNumber() ) << ") but has reference manager (" << refManager->getSequenceNumber() << ") " << "on conn " << conn->getServerAddress() << " (" << conn_in->getServerAddress() << ")" ); throw SendStaleConfigException( ns, msg, refManager->getVersion( shard ), ChunkVersion( 0, OID() )); } // has the ChunkManager been reloaded since the last time we updated the connection-level version? // (ie., last time we issued the setShardVersions below) unsigned long long sequenceNumber = connectionShardStatus.getSequence(conn,ns); if ( sequenceNumber == officialSequenceNumber ) { return false; } ChunkVersion version = ChunkVersion( 0, OID() ); if ( isSharded && manager ) { version = manager->getVersion( Shard::make( conn->getServerAddress() ) ); } if( ! version.isSet() ){ LOG(0) << "resetting shard version of " << ns << " on " << conn->getServerAddress() << ", " << ( ! isSharded ? "no longer sharded" : ( ! manager ? "no chunk manager found" : "version is zero" ) ) << endl; } LOG(2) << " have to set shard version for conn: " << conn->getServerAddress() << " ns:" << ns << " my last seq: " << sequenceNumber << " current: " << officialSequenceNumber << " version: " << version << " manager: " << manager.get() << endl; const string versionableServerAddress(conn->getServerAddress()); BSONObj result; if ( setShardVersion( *conn , ns , version , manager , authoritative , result ) ) { // success! LOG(1) << " setShardVersion success: " << result << endl; connectionShardStatus.setSequence( conn , ns , officialSequenceNumber ); return true; } LOG(1) << " setShardVersion failed!\n" << result << endl; if ( result["need_authoritative"].trueValue() ) massert( 10428 , "need_authoritative set but in authoritative mode already" , ! authoritative ); if ( ! 
authoritative ) { // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1); return true; } if ( result["reloadConfig"].trueValue() ) { if( result["version"].timestampTime() == 0 ){ warning() << "reloading full configuration for " << conf->getName() << ", connection state indicates significant version changes" << endl; // reload db conf->reload(); } else { // reload config conf->getChunkManager( ns , true ); } } const int maxNumTries = 7; if ( tryNumber < maxNumTries ) { LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 ) << "going to retry checkShardVersion host: " << versionableServerAddress << " " << result << endl; sleepmillis( 10 * tryNumber ); // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1); return true; } string errmsg = str::stream() << "setShardVersion failed host: " << versionableServerAddress << " " << result; log() << " " << errmsg << endl; massert( 10429 , errmsg , 0 ); return true; }
/** * Special internal logic to run reduced version handshake for empty namespace operations to * shards. * * Eventually this should go completely away, but for now many commands rely on unversioned but * mongos-specific behavior on mongod (auditing and replication information in commands) */ static bool initShardVersionEmptyNS(DBClientBase* conn_in) { bool ok; BSONObj result; DBClientBase* conn = NULL; try { // May throw if replica set primary is down conn = getVersionable(conn_in); dassert(conn); // errors thrown above // Check to see if we've already initialized this connection if (connectionShardStatus.hasAnySequenceSet(conn)) return false; // Check to see if this is actually a shard and not a single config server // NOTE: Config servers are registered only by the name "config" in the shard cache, not // by host, so lookup by host will fail unless the host is also a shard. const auto shard = grid.shardRegistry()->getShard(conn->getServerAddress()); if (!shard) { return false; } LOG(1) << "initializing shard connection to " << shard->toString() << endl; ok = setShardVersion(*conn, "", grid.catalogManager()->connectionString().toString(), ChunkVersion(), NULL, true, result); } catch (const DBException&) { // NOTE: Replica sets may fail to initShardVersion because future calls relying on // correct versioning must later call checkShardVersion on the primary. // Secondary queries and commands may not call checkShardVersion, but secondary ops // aren't versioned at all. if (conn_in->type() != ConnectionString::SET) { throw; } // NOTE: Only old-style cluster operations will talk via DBClientReplicaSets - using // checkShardVersion is required (which includes initShardVersion information) if these // connections are used. OCCASIONALLY { warning() << "failed to initialize new replica set connection version, " << "will initialize on first use" << endl; } return false; } // Record the connection wire version if sent in the response, initShardVersion is a // handshake for mongos->mongod connections. if (!result["minWireVersion"].eoo()) { int minWireVersion = result["minWireVersion"].numberInt(); int maxWireVersion = result["maxWireVersion"].numberInt(); conn->setWireVersions(minWireVersion, maxWireVersion); } LOG(3) << "initial sharding result : " << result << endl; connectionShardStatus.setSequence(conn, "", 0); return ok; }
Status MetadataLoader::initChunks( const string& ns, const string& shard, const CollectionMetadata* oldMetadata, CollectionMetadata* metadata ) const { map<string, ChunkVersion> versionMap; // Preserve the epoch versionMap[shard] = metadata->_shardVersion; OID epoch = metadata->getCollVersion().epoch(); bool fullReload = true; // Check to see if we should use the old version or not. if ( oldMetadata ) { // If our epochs are compatible, it's useful to use the old metadata for diffs if ( oldMetadata->getCollVersion().hasCompatibleEpoch( epoch ) ) { fullReload = false; dassert( oldMetadata->isValid() ); versionMap[shard] = oldMetadata->_shardVersion; metadata->_collVersion = oldMetadata->_collVersion; // TODO: This could be made more efficient if copying not required, but // not as frequently reloaded as in mongos. metadata->_chunksMap = oldMetadata->_chunksMap; LOG( 2 ) << "loading new chunks for collection " << ns << " using old metadata w/ version " << oldMetadata->getShardVersion() << " and " << metadata->_chunksMap.size() << " chunks" << endl; } else { warning() << "reloading collection metadata for " << ns << " with new epoch " << epoch.toString() << ", the current epoch is " << oldMetadata->getCollVersion().epoch().toString() << endl; } } // Exposes the new metadata's range map and version to the "differ," who // would ultimately be responsible of filling them up. SCMConfigDiffTracker differ( shard ); differ.attach( ns, metadata->_chunksMap, metadata->_collVersion, versionMap ); try { ScopedDbConnection conn( _configLoc.toString(), 30 ); auto_ptr<DBClientCursor> cursor = conn->query( ChunkType::ConfigNS, differ.configDiffQuery() ); if ( !cursor.get() ) { // Make our metadata invalid metadata->_collVersion = ChunkVersion( 0, 0, OID() ); metadata->_chunksMap.clear(); conn.done(); return Status( ErrorCodes::HostUnreachable, "problem opening chunk metadata cursor" ); } // // The diff tracker should always find at least one chunk (the highest chunk we saw // last time). If not, something has changed on the config server (potentially between // when we read the collection data and when we read the chunks data). // int diffsApplied = differ.calculateConfigDiff( *cursor ); if ( diffsApplied > 0 ) { // Chunks found, return ok LOG(2) << "loaded " << diffsApplied << " chunks into new metadata for " << ns << " with version " << metadata->_collVersion << endl; metadata->_shardVersion = versionMap[shard]; metadata->fillRanges(); conn.done(); dassert( metadata->isValid() ); return Status::OK(); } else if ( diffsApplied == 0 ) { // No chunks found, the collection is dropping or we're confused // If this is a full reload, assume it is a drop for backwards compatibility // TODO: drop the config.collections entry *before* the chunks and eliminate this // ambiguity string errMsg = str::stream() << "no chunks found when reloading " << ns << ", previous version was " << metadata->_collVersion.toString() << ( fullReload ? ", this is a drop" : "" ); warning() << errMsg << endl; metadata->_collVersion = ChunkVersion( 0, 0, OID() ); metadata->_chunksMap.clear(); conn.done(); return fullReload ? Status( ErrorCodes::NamespaceNotFound, errMsg ) : Status( ErrorCodes::RemoteChangeDetected, errMsg ); } else { // Invalid chunks found, our epoch may have changed because we dropped/recreated // the collection. 
string errMsg = // br str::stream() << "invalid chunks found when reloading " << ns << ", previous version was " << metadata->_collVersion.toString() << ", this should be rare"; warning() << errMsg << endl; metadata->_collVersion = ChunkVersion( 0, 0, OID() ); metadata->_chunksMap.clear(); conn.done(); return Status( ErrorCodes::RemoteChangeDetected, errMsg ); } } catch ( const DBException& e ) { string errMsg = str::stream() << "problem querying chunks metadata" << causedBy( e ); // We deliberately do not return connPtr to the pool, since it was involved // with the error here. return Status( ErrorCodes::HostUnreachable, errMsg ); } }
int ConfigDiffTracker<ValType,ShardType>:: calculateConfigDiff( DBClientCursorInterface& diffCursor ) { verifyAttached(); // Apply the chunk changes to the ranges and versions // // Overall idea here is to work in two steps : // 1. For all the new chunks we find, increment the maximum version per-shard and // per-collection, and remove any conflicting chunks from the ranges // 2. For all the new chunks we're interested in (all of them for mongos, just chunks on the // shard for mongod) add them to the ranges // vector<BSONObj> newTracked; // Store epoch now so it doesn't change when we change max OID currEpoch = _maxVersion->epoch(); _validDiffs = 0; while( diffCursor.more() ) { BSONObj diffChunkDoc = diffCursor.next(); ChunkVersion chunkVersion = ChunkVersion::fromBSON(diffChunkDoc, ChunkType::DEPRECATED_lastmod()); if( diffChunkDoc[ChunkType::min()].type() != Object || diffChunkDoc[ChunkType::max()].type() != Object || diffChunkDoc[ChunkType::shard()].type() != String ) { warning() << "got invalid chunk document " << diffChunkDoc << " when trying to load differing chunks" << endl; continue; } if( ! chunkVersion.isSet() || ! chunkVersion.hasCompatibleEpoch( currEpoch ) ) { warning() << "got invalid chunk version " << chunkVersion << " in document " << diffChunkDoc << " when trying to load differing chunks at version " << ChunkVersion( _maxVersion->toLong(), currEpoch ) << endl; // Don't keep loading, since we know we'll be broken here return -1; } _validDiffs++; // Get max changed version and chunk version if( chunkVersion > *_maxVersion ) *_maxVersion = chunkVersion; // Chunk version changes ShardType shard = shardFor( diffChunkDoc[ChunkType::shard()].String() ); typename map<ShardType, ChunkVersion>::iterator shardVersionIt = _maxShardVersions->find( shard ); if( shardVersionIt == _maxShardVersions->end() || shardVersionIt->second < chunkVersion ) { (*_maxShardVersions)[ shard ] = chunkVersion; } // See if we need to remove any chunks we are currently tracking b/c of this chunk's changes removeOverlapping(diffChunkDoc[ChunkType::min()].Obj(), diffChunkDoc[ChunkType::max()].Obj()); // Figure out which of the new chunks we need to track // Important - we need to actually own this doc, in case the cursor decides to getMore or unbuffer if( isTracked( diffChunkDoc ) ) newTracked.push_back( diffChunkDoc.getOwned() ); } LOG(3) << "found " << _validDiffs << " new chunks for collection " << _ns << " (tracking " << newTracked.size() << "), new version is " << *_maxVersion << endl; for( vector<BSONObj>::iterator it = newTracked.begin(); it != newTracked.end(); it++ ) { BSONObj chunkDoc = *it; // Important - we need to make sure we actually own the min and max here BSONObj min = chunkDoc[ChunkType::min()].Obj().getOwned(); BSONObj max = chunkDoc[ChunkType::max()].Obj().getOwned(); // Invariant enforced by sharding // It's possible to read inconsistent state b/c of getMore() and yielding, so we want // to detect as early as possible. // TODO: This checks for overlap, we also should check for holes here iff we're tracking // all chunks if( isOverlapping( min, max ) ) return -1; _currMap->insert( rangeFor( chunkDoc, min, max ) ); } return _validDiffs; }
void OperationShardingState::_clear() {
    _hasVersion = false;
    _shardVersion = ChunkVersion();
    _ns = NamespaceString();
}
bool ChunkManager::_load(ChunkMap& chunkMap, set<ShardId>& shardIds, ShardVersionMap* shardVersions, const ChunkManager* oldManager) { // Reset the max version, but not the epoch, when we aren't loading from the oldManager _version = ChunkVersion(0, 0, _version.epoch()); // If we have a previous version of the ChunkManager to work from, use that info to reduce // our config query if (oldManager && oldManager->getVersion().isSet()) { // Get the old max version _version = oldManager->getVersion(); // Load a copy of the old versions *shardVersions = oldManager->_shardVersions; // Load a copy of the chunk map, replacing the chunk manager with our own const ChunkMap& oldChunkMap = oldManager->getChunkMap(); // Could be v.expensive // TODO: If chunks were immutable and didn't reference the manager, we could do more // interesting things here for (const auto& oldChunkMapEntry : oldChunkMap) { shared_ptr<Chunk> oldC = oldChunkMapEntry.second; shared_ptr<Chunk> newC(new Chunk( this, oldC->getMin(), oldC->getMax(), oldC->getShardId(), oldC->getLastmod())); newC->setBytesWritten(oldC->getBytesWritten()); chunkMap.insert(make_pair(oldC->getMax(), newC)); } LOG(2) << "loading chunk manager for collection " << _ns << " using old chunk manager w/ version " << _version.toString() << " and " << oldChunkMap.size() << " chunks"; } // Attach a diff tracker for the versioned chunk data CMConfigDiffTracker differ(this); differ.attach(_ns, chunkMap, _version, *shardVersions); // Diff tracker should *always* find at least one chunk if collection exists // Get the diff query required auto diffQuery = differ.configDiffQuery(); std::vector<ChunkType> chunks; uassertStatusOK( grid.catalogManager()->getChunks(diffQuery.query, diffQuery.sort, boost::none, &chunks)); int diffsApplied = differ.calculateConfigDiff(chunks); if (diffsApplied > 0) { LOG(2) << "loaded " << diffsApplied << " chunks into new chunk manager for " << _ns << " with version " << _version; // Add all existing shards we find to the shards set for (ShardVersionMap::iterator it = shardVersions->begin(); it != shardVersions->end();) { shared_ptr<Shard> shard = grid.shardRegistry()->getShard(it->first); if (shard) { shardIds.insert(it->first); ++it; } else { shardVersions->erase(it++); } } return true; } else if (diffsApplied == 0) { // No chunks were found for the ns warning() << "no chunks found when reloading " << _ns << ", previous version was " << _version; // Set all our data to empty chunkMap.clear(); shardVersions->clear(); _version = ChunkVersion(0, 0, OID()); return true; } else { // diffsApplied < 0 bool allInconsistent = (differ.numValidDiffs() == 0); if (allInconsistent) { // All versions are different, this can be normal warning() << "major change in chunk information found when reloading " << _ns << ", previous version was " << _version; } else { // Inconsistent load halfway through (due to yielding cursor during load) // should be rare warning() << "inconsistent chunks found when reloading " << _ns << ", previous version was " << _version << ", this should be rare"; } // Set all our data to empty to be extra safe chunkMap.clear(); shardVersions->clear(); _version = ChunkVersion(0, 0, OID()); return allInconsistent; } }
Status ShardingState::refreshMetadataNow( const string& ns, ChunkVersion* latestShardVersion ) {
    return doRefreshMetadata( ns, ChunkVersion( 0, 0, OID() ), false, latestShardVersion );
}
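Because the forwarded version is always zero, this entry point forces a refresh regardless of what the shard already has cached. A hedged usage sketch follows; the "test.users" namespace and the surrounding call site are made up for illustration, and only the refreshMetadataNow() signature and the warning()/log()/causedBy() helpers come from the snippets in this document.

// Hypothetical call site: force a metadata reload for a namespace, ignoring
// whatever shard version is currently cached.
ChunkVersion latestShardVersion;
Status status = shardingState.refreshMetadataNow( "test.users", &latestShardVersion );
if ( !status.isOK() ) {
    warning() << "metadata refresh for test.users failed" << causedBy( status.reason() );
}
else {
    log() << "refreshed test.users to shard version " << latestShardVersion << endl;
}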
int ConfigDiffTracker<ValType, ShardType>::calculateConfigDiff(
    const std::vector<ChunkType>& chunks) {
    _assertAttached();

    // Apply the chunk changes to the ranges and versions
    //
    // Overall idea here is to work in two steps :
    // 1. For all the new chunks we find, increment the maximum version per-shard and
    //    per-collection, and remove any conflicting chunks from the ranges.
    // 2. For all the new chunks we're interested in (all of them for mongos, just chunks on
    //    the shard for mongod) add them to the ranges.

    std::vector<ChunkType> newTracked;

    // Store epoch now so it doesn't change when we change max
    OID currEpoch = _maxVersion->epoch();

    _validDiffs = 0;

    for (const ChunkType& chunk : chunks) {
        ChunkVersion chunkVersion =
            ChunkVersion::fromBSON(chunk.toBSON(), ChunkType::DEPRECATED_lastmod());

        if (!chunkVersion.isSet() || !chunkVersion.hasEqualEpoch(currEpoch)) {
            warning() << "got invalid chunk version " << chunkVersion << " in document "
                      << chunk.toString()
                      << " when trying to load differing chunks at version "
                      << ChunkVersion(
                             _maxVersion->majorVersion(), _maxVersion->minorVersion(), currEpoch);

            // Don't keep loading, since we know we'll be broken here
            return -1;
        }

        _validDiffs++;

        // Get max changed version and chunk version
        if (chunkVersion > *_maxVersion) {
            *_maxVersion = chunkVersion;
        }

        // Chunk version changes
        ShardType shard = shardFor(chunk.getShard());

        typename MaxChunkVersionMap::const_iterator shardVersionIt = _maxShardVersions->find(shard);
        if (shardVersionIt == _maxShardVersions->end() || shardVersionIt->second < chunkVersion) {
            (*_maxShardVersions)[shard] = chunkVersion;
        }

        // See if we need to remove any chunks we are currently tracking because of this
        // chunk's changes
        removeOverlapping(chunk.getMin(), chunk.getMax());

        // Figure out which of the new chunks we need to track
        // Important - we need to actually own this doc, in case the cursor decides to getMore
        // or unbuffer.
        if (isTracked(chunk)) {
            newTracked.push_back(chunk);
        }
    }

    LOG(3) << "found " << _validDiffs << " new chunks for collection " << _ns << " (tracking "
           << newTracked.size() << "), new version is " << *_maxVersion;

    for (const ChunkType& chunk : newTracked) {
        // Invariant enforced by sharding - it's possible to read inconsistent state due to
        // getMore and yielding, so we want to detect it as early as possible.
        //
        // TODO: This checks for overlap, we also should check for holes here iff we're
        // tracking all chunks.
        if (isOverlapping(chunk.getMin(), chunk.getMax())) {
            return -1;
        }

        _currMap->insert(rangeFor(chunk));
    }

    return _validDiffs;
}
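Compared with the cursor-based variant earlier, this overload expects the caller to have already parsed and validated the chunk documents, which is why the per-field type checks are gone. The sketch below shows one way a caller might bridge raw config documents into this API; ChunkType::fromBSON() returning a StatusWith, the rawChunkDocs input and the differ variable are assumptions made for illustration.

// Hypothetical adapter: parse raw config.chunks documents into ChunkType values
// before handing them to calculateConfigDiff().
std::vector<ChunkType> chunks;
for (const BSONObj& doc : rawChunkDocs) {
    auto parseStatus = ChunkType::fromBSON(doc);
    if (!parseStatus.isOK()) {
        warning() << "skipping malformed chunk document " << doc
                  << causedBy(parseStatus.getStatus());
        continue;
    }
    chunks.push_back(parseStatus.getValue());
}

const int diffsApplied = differ.calculateConfigDiff(chunks);
if (diffsApplied < 0) {
    // Same contract as before: a negative result means the read was inconsistent
    // and the whole load should be retried from scratch.
}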
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {

    // Steps
    // 1. check basic config
    // 2. extract params from command
    // 3. fast check
    // 4. slow check (LOCKS)

    // step 1

    lastError.disableForCommand();
    ShardedConnectionInfo* info = ShardedConnectionInfo::get( true );

    // make sure we have the mongos id for writebacks
    if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) )
        return false;

    bool authoritative = cmdObj.getBoolField( "authoritative" );

    // check config server is ok or enable sharding
    if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) )
        return false;

    // check shard name/hosts are correct
    if ( cmdObj["shard"].type() == String ) {
        shardingState.gotShardName( cmdObj["shard"].String() );
    }

    // Handle initial shard connection
    if( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ){

        result.append( "initialized", true );

        // Send back wire version to let mongos know what protocol we can speak
        result.append( "minWireVersion", minWireVersion );
        result.append( "maxWireVersion", maxWireVersion );

        return true;
    }

    // we can run on a slave up to here
    if ( ! isMaster( "admin" ) ) {
        result.append( "errmsg" , "not master" );
        result.append( "note" , "from post init in setShardVersion" );
        return false;
    }

    // step 2
    string ns = cmdObj["setShardVersion"].valuestrsafe();
    if ( ns.size() == 0 ) {
        errmsg = "need to specify namespace";
        return false;
    }

    if( ! ChunkVersion::canParseBSON( cmdObj, "version" ) ){
        errmsg = "need to specify version";
        return false;
    }

    const ChunkVersion version = ChunkVersion::fromBSON( cmdObj, "version" );

    // step 3
    const ChunkVersion oldVersion = info->getVersion(ns);
    const ChunkVersion globalVersion = shardingState.getVersion(ns);

    oldVersion.addToBSON( result, "oldVersion" );

    if ( globalVersion.isSet() && version.isSet() ) {
        // this means there is no reset going on on either side,
        // so it's safe to make some assumptions

        if ( version.isWriteCompatibleWith( globalVersion ) ) {
            // mongos and mongod agree!
            if ( ! oldVersion.isWriteCompatibleWith( version ) ) {
                if ( oldVersion < globalVersion &&
                     oldVersion.hasCompatibleEpoch(globalVersion) ) {
                    info->setVersion( ns , version );
                }
                else if ( authoritative ) {
                    // this means there was a drop and our version is reset
                    info->setVersion( ns , version );
                }
                else {
                    result.append( "ns" , ns );
                    result.appendBool( "need_authoritative" , true );
                    errmsg = "verifying drop on '" + ns + "'";
                    return false;
                }
            }
            return true;
        }
    }

    // step 4

    // this is because of a weird segfault I saw and I can't see why this should ever be set
    massert( 13647 ,
             str::stream() << "context should be empty here, is: " << cc().getContext()->ns() ,
             cc().getContext() == 0 );

    if ( oldVersion.isSet() && ! globalVersion.isSet() ) {
        // this had been reset
        info->setVersion( ns , ChunkVersion( 0, OID() ) );
    }

    if ( ! version.isSet() && ! globalVersion.isSet() ) {
        // this connection is cleaning itself
        info->setVersion( ns , ChunkVersion( 0, OID() ) );
        return true;
    }

    // Cases below all either return OR fall-through to remote metadata reload.
    if ( version.isSet() || !globalVersion.isSet() ) {
        // Not Dropping

        // TODO: Refactor all of this
        if ( version < oldVersion && version.hasCompatibleEpoch( oldVersion ) ) {
            errmsg = "this connection already had a newer version of collection '" + ns + "'";
            result.append( "ns" , ns );
            version.addToBSON( result, "newVersion" );
            globalVersion.addToBSON( result, "globalVersion" );
            return false;
        }

        // TODO: Refactor all of this
        if ( version < globalVersion && version.hasCompatibleEpoch( globalVersion ) ) {
            while ( shardingState.inCriticalMigrateSection() ) {
                log() << "waiting till out of critical section" << endl;
                shardingState.waitTillNotInCriticalSection( 10 );
            }
            errmsg = "shard global version for collection is higher than trying to set to '" + ns + "'";
            result.append( "ns" , ns );
            version.addToBSON( result, "version" );
            globalVersion.addToBSON( result, "globalVersion" );
            result.appendBool( "reloadConfig" , true );
            return false;
        }

        if ( ! globalVersion.isSet() && ! authoritative ) {
            // Needed b/c when the last chunk is moved off a shard, the version gets reset to zero,
            // which should require a reload.
            while ( shardingState.inCriticalMigrateSection() ) {
                log() << "waiting till out of critical section" << endl;
                shardingState.waitTillNotInCriticalSection( 10 );
            }

            // need authoritative for first look
            result.append( "ns" , ns );
            result.appendBool( "need_authoritative" , true );
            errmsg = "first time for collection '" + ns + "'";
            return false;
        }

        // Fall through to metadata reload below
    }
    else {
        // Dropping

        if ( ! authoritative ) {
            result.appendBool( "need_authoritative" , true );
            result.append( "ns" , ns );
            globalVersion.addToBSON( result, "globalVersion" );
            errmsg = "dropping needs to be authoritative";
            return false;
        }

        // Fall through to metadata reload below
    }

    ChunkVersion currVersion;
    Status status = shardingState.refreshMetadataIfNeeded( ns, version, &currVersion );

    if (!status.isOK()) {

        // The reload itself was interrupted or confused here

        errmsg = str::stream() << "could not refresh metadata for " << ns
                               << " with requested shard version " << version.toString()
                               << ", stored shard version is " << currVersion.toString()
                               << causedBy( status.reason() );

        warning() << errmsg << endl;

        result.append( "ns" , ns );
        version.addToBSON( result, "version" );
        currVersion.addToBSON( result, "globalVersion" );
        result.appendBool( "reloadConfig", true );

        return false;
    }
    else if ( !version.isWriteCompatibleWith( currVersion ) ) {

        // We reloaded a version that doesn't match the version mongos was trying to
        // set.

        errmsg = str::stream() << "requested shard version differs from"
                               << " config shard version for " << ns
                               << ", requested version is " << version.toString()
                               << " but found version " << currVersion.toString();

        OCCASIONALLY warning() << errmsg << endl;

        // WARNING: the exact fields below are important for compatibility with mongos
        // version reload.

        result.append( "ns" , ns );
        currVersion.addToBSON( result, "globalVersion" );

        // If this was a reset of a collection or the last chunk moved out, inform mongos to
        // do a full reload.
        if (currVersion.epoch() != version.epoch() || !currVersion.isSet() ) {

            result.appendBool( "reloadConfig", true );

            // Zero-version also needed to trigger full mongos reload, sadly
            // TODO: Make this saner, and less impactful (full reload on last chunk is bad)
            ChunkVersion( 0, 0, OID() ).addToBSON( result, "version" );

            // For debugging
            version.addToBSON( result, "origVersion" );
        }
        else {
            version.addToBSON( result, "version" );
        }

        return false;
    }

    info->setVersion( ns , version );
    return true;
}
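For reference, the handler above expects a command shaped roughly as follows. This is a hedged sketch of how a router-side caller might assemble it, reusing the ChunkVersion::addToBSON() helper that the handler's own replies use; the config string, shard name, namespace and the mongosID / version / conn variables are placeholders, and the real mongos code path is not reproduced here.

// Hypothetical construction of a setShardVersion command for "test.users".
BSONObjBuilder cmdBuilder;
cmdBuilder.append( "setShardVersion" , "test.users" );
cmdBuilder.append( "configdb" , "cfg1:29000,cfg2:29000,cfg3:29000" );
cmdBuilder.append( "shard" , "shard0000" );
cmdBuilder.append( "serverID" , mongosID );          // OID identifying this mongos
cmdBuilder.appendBool( "authoritative" , false );    // set true only when retrying after need_authoritative

// The collection version mongos believes the shard should be at, encoded with the
// same helper used for "oldVersion"/"globalVersion" in the replies above.
version.addToBSON( cmdBuilder , "version" );

BSONObj cmd = cmdBuilder.obj();
// conn.runCommand( "admin" , cmd , result ) would then hit the run() handler above;
// a failed reply with need_authoritative:true means resend with authoritative:true.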