CollectionManager* MetadataLoader::makeEmptyCollectionManager() {
    CollectionManager* manager = new CollectionManager;
    // Empty collections start at version 1|0 with a null epoch
    manager->_maxCollVersion = ShardChunkVersion(1, 0, OID());
    manager->_maxShardVersion = ShardChunkVersion(1, 0, OID());
    dassert(manager->isValid());
    return manager;
}
void Strategy::insert( const Shard& shard , const char * ns , const vector<BSONObj>& v , int flags, bool safe ) {
    ShardConnection dbcon( shard , ns );
    if ( dbcon.setVersion() ) {
        dbcon.done();
        // Version is zero b/c we don't yet have a way to get the local version conflict
        throw RecvStaleConfigException( ns , "for insert", ShardChunkVersion( 0 ), ShardChunkVersion( 0 ) );
    }
    dbcon->insert( ns , v , flags );
    if ( safe )
        dbcon->getLastError();
    dbcon.done();
}
void Strategy::update( const Shard& shard , const char * ns , const BSONObj& query , const BSONObj& toupdate , int flags, bool safe ) {
    bool upsert = flags & UpdateOption_Upsert;
    bool multi = flags & UpdateOption_Multi;

    ShardConnection dbcon( shard , ns );
    if ( dbcon.setVersion() ) {
        dbcon.done();
        // Version is zero b/c we don't yet have a way to get the local version conflict
        throw RecvStaleConfigException( ns , "for update", ShardChunkVersion( 0 ), ShardChunkVersion( 0 ) );
    }
    dbcon->update( ns , query , toupdate , upsert , multi );
    if ( safe )
        dbcon->getLastError();
    dbcon.done();
}
void Strategy::doQuery( Request& r , const Shard& shard ) {
    r.checkAuth( Auth::READ );

    ShardConnection dbcon( shard , r.getns() );
    DBClientBase &c = dbcon.conn();

    string actualServer;

    Message response;
    bool ok = c.call( r.m(), response, true , &actualServer );
    uassert( 10200 , "mongos: error calling db", ok );

    {
        QueryResult *qr = (QueryResult *) response.singleData();
        if ( qr->resultFlags() & ResultFlag_ShardConfigStale ) {
            dbcon.done();
            // Version is zero b/c this is deprecated codepath
            throw RecvStaleConfigException( r.getns() , "Strategy::doQuery", ShardChunkVersion( 0 ), ShardChunkVersion( 0 ) );
        }
    }

    r.reply( response , actualServer.size() ? actualServer : c.getServerAddress() );
    dbcon.done();
}
ShardChunkManager* ShardChunkManager::cloneMinus( const BSONObj& min, const BSONObj& max, const ShardChunkVersion& version ) {

    // check that we have the exact chunk that will be subtracted
    _assertChunkExists( min , max );

    auto_ptr<ShardChunkManager> p( new ShardChunkManager );
    p->_key = this->_key;

    if ( _chunksMap.size() == 1 ) {
        // if left with no chunks, just reset version
        uassert( 13590 ,
                 str::stream() << "setting version to " << version.toString() << " on removing last chunk",
                 ! version.isSet() );

        p->_version = ShardChunkVersion( 0, OID() );
        p->_collVersion = _collVersion;
    }
    else {
        // can't move version backwards when subtracting chunks
        // this is what guarantees that no read or write would be taken once we subtract data from the current shard
        if ( version <= _version ) {
            uasserted( 13585 ,
                       str::stream() << "version " << version.toString()
                                     << " not greater than " << _version.toString() );
        }

        p->_chunksMap = this->_chunksMap;
        p->_chunksMap.erase( min );
        p->_version = version;
        if ( version > _collVersion ) p->_collVersion = version;
        else p->_collVersion = this->_collVersion;
        p->_fillRanges();
    }

    return p.release();
}
void Strategy::doWrite( int op , Request& r , const Shard& shard , bool checkVersion ) {
    ShardConnection conn( shard , r.getns() );
    if ( ! checkVersion )
        conn.donotCheckVersion();
    else if ( conn.setVersion() ) {
        conn.done();
        // Version is zero b/c we don't yet have a way to get the local version conflict
        throw RecvStaleConfigException( r.getns() , "doWrite" , ShardChunkVersion( 0 ), ShardChunkVersion( 0 ) );
    }
    conn->say( r.m() );
    conn.done();
}
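// Hypothetical caller-side sketch (illustration only, not from this codebase):
// a caller of the write paths above would be expected to catch the stale-config
// exception, refresh its view of the chunk distribution, and retry. The
// 'kMaxRetries' constant and the retry loop are assumptions for illustration;
// the forced-reload call mirrors the one used in checkShardVersion() below.
//
//   for ( int attempt = 0; attempt < kMaxRetries; attempt++ ) {
//       try {
//           Strategy::insert( shard, ns, docs, flags, safe );
//           break;
//       }
//       catch ( RecvStaleConfigException& e ) {
//           // force a reload of this collection's chunk manager, then retry
//           grid.getDBConfig( ns )->getChunkManagerIfExists( ns, true );
//       }
//   }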
void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ) {
    scoped_lock lk( _mutex );

    ChunkManagersMap::const_iterator it = _chunks.find( ns );
    assert( it != _chunks.end() ) ;
    ShardChunkManagerPtr p = it->second;

    // empty shards should have version 0
    version = ( p->getNumChunks() > 1 ) ? version : ShardChunkVersion( 0 , 0 );

    ShardChunkManagerPtr cloned( p->cloneMinus( min , max , version ) );
    _chunks[ns] = cloned;
}
/**
 * @return true if had to do something
 */
bool checkShardVersion( DBClientBase * conn_in , const string& ns , ChunkManagerPtr refManager, bool authoritative , int tryNumber ) {
    // TODO: cache, optimize, etc...

    WriteBackListener::init( *conn_in );

    DBConfigPtr conf = grid.getDBConfig( ns );
    if ( ! conf )
        return false;

    DBClientBase* conn = getVersionable( conn_in );
    verify(conn); // errors thrown above

    unsigned long long officialSequenceNumber = 0;

    ChunkManagerPtr manager;
    const bool isSharded = conf->isSharded( ns );
    if ( isSharded ) {
        manager = conf->getChunkManagerIfExists( ns , authoritative );
        // It's possible the chunk manager was reset since we checked whether sharded was true,
        // so must check this here.
        if( manager ) officialSequenceNumber = manager->getSequenceNumber();
    }

    // Check this manager against the reference manager
    if( isSharded && manager ){

        Shard shard = Shard::make( conn->getServerAddress() );
        if( refManager && ! refManager->compatibleWith( manager, shard ) ){
            throw SendStaleConfigException( ns,
                    str::stream() << "manager (" << manager->getVersion( shard ).toString()
                                  << " : " << manager->getSequenceNumber() << ") "
                                  << "not compatible with reference manager ("
                                  << refManager->getVersion( shard ).toString()
                                  << " : " << refManager->getSequenceNumber() << ") "
                                  << "on shard " << shard.getName()
                                  << " (" << shard.getAddress().toString() << ")",
                    refManager->getVersion( shard ), manager->getVersion( shard ) );
        }
    }
    else if( refManager ){

        Shard shard = Shard::make( conn->getServerAddress() );
        string msg( str::stream() << "not sharded ("
                << ( (manager.get() == 0) ? string( "<none>" ) :
                        str::stream() << manager->getSequenceNumber() )
                << ") but has reference manager ("
                << refManager->getSequenceNumber() << ") "
                << "on conn " << conn->getServerAddress() << " ("
                << conn_in->getServerAddress() << ")" );

        throw SendStaleConfigException( ns, msg,
                refManager->getVersion( shard ), ShardChunkVersion( 0, OID() ) );
    }

    // has the ChunkManager been reloaded since the last time we updated the connection-level version?
    // (ie., last time we issued the setShardVersions below)
    unsigned long long sequenceNumber = connectionShardStatus.getSequence(conn,ns);
    if ( sequenceNumber == officialSequenceNumber ) {
        return false;
    }

    ShardChunkVersion version = ShardChunkVersion( 0, OID() );
    if ( isSharded && manager ) {
        version = manager->getVersion( Shard::make( conn->getServerAddress() ) );
    }

    if( ! version.isSet() ){
        LOG(0) << "resetting shard version of " << ns << " on " << conn->getServerAddress() << ", "
               << ( ! isSharded ? "no longer sharded" :
                  ( ! manager ? "no chunk manager found" :
                                "version is zero" ) ) << endl;
    }

    LOG(2) << " have to set shard version for conn: " << conn->getServerAddress() << " ns:" << ns
           << " my last seq: " << sequenceNumber << " current: " << officialSequenceNumber
           << " version: " << version << " manager: " << manager.get() << endl;

    const string versionableServerAddress(conn->getServerAddress());

    BSONObj result;
    if ( setShardVersion( *conn , ns , version , authoritative , result ) ) {
        // success!
        LOG(1) << " setShardVersion success: " << result << endl;
        connectionShardStatus.setSequence( conn , ns , officialSequenceNumber );
        return true;
    }

    LOG(1) << " setShardVersion failed!\n" << result << endl;

    if ( result["need_authoritative"].trueValue() )
        massert( 10428 , "need_authoritative set but in authoritative mode already" , ! authoritative );

    if ( ! authoritative ) {
        // use the original connection and get a fresh versionable connection
        // since conn can be invalidated (or worse, freed) after the failure
        checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1);
        return true;
    }

    if ( result["reloadConfig"].trueValue() ) {
        if( result["version"].timestampTime() == 0 ){

            warning() << "reloading full configuration for " << conf->getName()
                      << ", connection state indicates significant version changes" << endl;

            // reload db
            conf->reload();
        }
        else {
            // reload config
            conf->getChunkManager( ns , true );
        }
    }

    const int maxNumTries = 7;
    if ( tryNumber < maxNumTries ) {
        LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 )
            << "going to retry checkShardVersion host: " << versionableServerAddress << " " << result << endl;
        sleepmillis( 10 * tryNumber );
        // use the original connection and get a fresh versionable connection
        // since conn can be invalidated (or worse, freed) after the failure
        checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1);
        return true;
    }

    string errmsg = str::stream() << "setShardVersion failed host: " << versionableServerAddress << " " << result;
    log() << " " << errmsg << endl;
    massert( 10429 , errmsg , 0 );
    return true;
}
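// For reference, the retry logic above keys off a handful of fields in the
// setShardVersion reply. Based only on the fields this function actually reads,
// a failed reply might look like one of the following (illustrative values,
// not captured from a real server):
//
//   { ok: 0, ns: "test.foo", need_authoritative: true, errmsg: "..." }      // retry authoritatively
//   { ok: 0, reloadConfig: true, version: Timestamp(0, 0), errmsg: "..." }  // reload the full db config
//   { ok: 0, reloadConfig: true, version: Timestamp(4, 1), errmsg: "..." }  // reload this collection's chunks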
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {

    // Steps
    // 1. check basic config
    // 2. extract params from command
    // 3. fast check
    // 4. slow check (LOCKS)

    // step 1

    lastError.disableForCommand();
    ShardedConnectionInfo* info = ShardedConnectionInfo::get( true );

    // make sure we have the mongos id for writebacks
    if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) )
        return false;

    bool authoritative = cmdObj.getBoolField( "authoritative" );

    // check config server is ok or enable sharding
    if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) )
        return false;

    // check shard name/hosts are correct
    if ( cmdObj["shard"].type() == String ) {
        shardingState.gotShardName( cmdObj["shard"].String() );
        shardingState.gotShardHost( cmdObj["shardHost"].String() );
    }

    // Handle initial shard connection
    if( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ){
        result.append( "initialized", true );
        return true;
    }

    // we can run on a slave up to here
    if ( ! isMaster( "admin" ) ) {
        result.append( "errmsg" , "not master" );
        result.append( "note" , "from post init in setShardVersion" );
        return false;
    }

    // step 2

    string ns = cmdObj["setShardVersion"].valuestrsafe();
    if ( ns.size() == 0 ) {
        errmsg = "need to specify namespace";
        return false;
    }

    const ConfigVersion version = ConfigVersion( extractVersion( cmdObj["version"] , errmsg ), OID() );
    if ( errmsg.size() )
        return false;

    // step 3

    const ConfigVersion oldVersion = info->getVersion(ns);
    const ConfigVersion globalVersion = shardingState.getVersion(ns);

    oldVersion.addToBSON( result, "oldVersion" );

    if ( globalVersion.isSet() && version.isSet() ) {
        // this means there is no reset going on on either side
        // so its safe to make some assumptions

        if ( version.isEquivalentTo( globalVersion ) ) {
            // mongos and mongod agree!
            if ( ! oldVersion.isEquivalentTo( version ) ) {
                if ( oldVersion < globalVersion ) {
                    info->setVersion( ns , version );
                }
                else if ( authoritative ) {
                    // this means there was a drop and our version is reset
                    info->setVersion( ns , version );
                }
                else {
                    result.append( "ns" , ns );
                    result.appendBool( "need_authoritative" , true );
                    errmsg = "verifying drop on '" + ns + "'";
                    return false;
                }
            }
            return true;
        }
    }

    // step 4

    // this is because of a weird segfault I saw and I can't see why this should ever be set
    massert( 13647 ,
             str::stream() << "context should be empty here, is: " << cc().getContext()->ns() ,
             cc().getContext() == 0 );

    Lock::GlobalWrite setShardVersionLock; // TODO: can we get rid of this??

    if ( oldVersion.isSet() && ! globalVersion.isSet() ) {
        // this had been reset
        info->setVersion( ns , ShardChunkVersion( 0, OID() ) );
    }

    if ( ! version.isSet() && ! globalVersion.isSet() ) {
        // this connection is cleaning itself
        info->setVersion( ns , ShardChunkVersion( 0, OID() ) );
        return true;
    }

    if ( ! version.isSet() && globalVersion.isSet() ) {
        if ( ! authoritative ) {
            result.appendBool( "need_authoritative" , true );
            result.append( "ns" , ns );
            globalVersion.addToBSON( result, "globalVersion" );
            errmsg = "dropping needs to be authoritative";
            return false;
        }
        log() << "wiping data for: " << ns << endl;
        globalVersion.addToBSON( result, "beforeDrop" );
        // only setting global version on purpose
        // need clients to re-find meta-data
        shardingState.resetVersion( ns );
        info->setVersion( ns , ShardChunkVersion( 0, OID() ) );
        return true;
    }

    if ( version < oldVersion ) {
        errmsg = "this connection already had a newer version of collection '" + ns + "'";
        result.append( "ns" , ns );
        version.addToBSON( result, "newVersion" );
        globalVersion.addToBSON( result, "globalVersion" );
        return false;
    }

    if ( version < globalVersion ) {
        while ( shardingState.inCriticalMigrateSection() ) {
            dbtemprelease r;
            sleepmillis(2);
            OCCASIONALLY log() << "waiting till out of critical section" << endl;
        }
        errmsg = "shard global version for collection is higher than trying to set to '" + ns + "'";
        result.append( "ns" , ns );
        version.addToBSON( result, "version" );
        globalVersion.addToBSON( result, "globalVersion" );
        result.appendBool( "reloadConfig" , true );
        return false;
    }

    if ( ! globalVersion.isSet() && ! authoritative ) {
        // Needed b/c when the last chunk is moved off a shard, the version gets reset to zero, which
        // should require a reload.
        // TODO: Maybe a more elegant way of doing this
        while ( shardingState.inCriticalMigrateSection() ) {
            dbtemprelease r;
            sleepmillis(2);
            OCCASIONALLY log() << "waiting till out of critical section for version reset" << endl;
        }

        // need authoritative for first look
        result.append( "ns" , ns );
        result.appendBool( "need_authoritative" , true );
        errmsg = "first time for collection '" + ns + "'";
        return false;
    }

    Timer relockTime;
    {
        dbtemprelease unlock;

        ShardChunkVersion currVersion = version;
        if ( ! shardingState.trySetVersion( ns , currVersion ) ) {
            errmsg = str::stream() << "client version differs from config's for collection '" << ns << "'";
            result.append( "ns" , ns );
            version.addToBSON( result, "version" );
            globalVersion.addToBSON( result, "globalVersion" );
            return false;
        }
    }
    if ( relockTime.millis() >= ( cmdLine.slowMS - 10 ) ) {
        log() << "setShardVersion - relocking slow: " << relockTime.millis() << endl;
    }

    info->setVersion( ns , version );
    return true;
}
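// For reference, the command document this handler parses looks roughly like
// the following. The field names are taken from the cmdObj accesses above; the
// values are illustrative only:
//
//   {
//       setShardVersion: "test.foo",          // namespace to version
//       configdb:        "cfg1,cfg2,cfg3",    // config server connection string
//       version:         Timestamp(4, 1),     // version mongos wants this connection at
//       serverID:        ObjectId("..."),     // mongos id, for writebacks
//       shard:           "shard0000",
//       shardHost:       "host:27018",
//       authoritative:   false,
//       init:            false
//   }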
bool MetadataLoader::initCollection(const string& ns,
                                    const string& shard,
                                    const CollectionManager* oldManager,
                                    CollectionManager* manager,
                                    string* errMsg) {
    //
    // Bring the collection entry from the config server.
    //

    BSONObj collObj;
    {
        scoped_ptr<ScopedDbConnection> connPtr;

        try {
            connPtr.reset(
                ScopedDbConnection::getInternalScopedDbConnection(_configLoc.toString(), 30));
            ScopedDbConnection& conn = *connPtr;

            collObj = conn->findOne(CollectionType::ConfigNS, QUERY(CollectionType::ns()<<ns));
        }
        catch (const DBException& e) {
            *errMsg = str::stream() << "caught exception accessing the config servers "
                                    << causedBy(e);

            // We deliberately do not return connPtr to the pool, since it was involved
            // with the error here.

            return false;
        }

        connPtr->done();
    }

    CollectionType collDoc;
    if (!collDoc.parseBSON(collObj, errMsg) || !collDoc.isValid(errMsg)) {
        return false;
    }

    //
    // Load or generate default chunks for collection config.
    //

    if (!collDoc.getKeyPattern().isEmpty()) {

        manager->_key = collDoc.getKeyPattern();

        if (!initChunks(collDoc, ns, shard, oldManager, manager, errMsg)) {
            return false;
        }
    }
    else if (collDoc.getPrimary() == shard) {

        if (shard == "") {
            warning() << "shard not verified, assuming collection " << ns
                      << " is unsharded on this shard" << endl;
        }

        manager->_key = BSONObj();
        manager->_maxShardVersion = ShardChunkVersion(1, 0, collDoc.getEpoch());
        manager->_maxCollVersion = manager->_maxShardVersion;
    }
    else {
        *errMsg = str::stream() << "collection " << ns << " does not have a shard key "
                                << "and primary " << collDoc.getPrimary()
                                << " does not match this shard " << shard;
        return false;
    }

    return true;
}
bool MetadataLoader::initChunks(const CollectionType& collDoc,
                                const string& ns,
                                const string& shard,
                                const CollectionManager* oldManager,
                                CollectionManager* manager,
                                string* errMsg) {

    map<string,ShardChunkVersion> versionMap;
    manager->_maxCollVersion = ShardChunkVersion(0, 0, collDoc.getEpoch());

    // Check to see if we should use the old version or not.
    if (oldManager) {
        ShardChunkVersion oldVersion = oldManager->getMaxShardVersion();

        if (oldVersion.isSet() && oldVersion.hasCompatibleEpoch(collDoc.getEpoch())) {

            // Our epoch for coll version and shard version should be the same.
            verify(oldManager->getMaxCollVersion().hasCompatibleEpoch(collDoc.getEpoch()));

            versionMap[shard] = oldManager->_maxShardVersion;
            manager->_maxCollVersion = oldManager->_maxCollVersion;

            // TODO: This could be made more efficient if copying not required, but
            // not as frequently reloaded as in mongos.
            manager->_chunksMap = oldManager->_chunksMap;

            LOG(2) << "loading new chunks for collection " << ns
                   << " using old chunk manager w/ version "
                   << oldManager->getMaxShardVersion()
                   << " and " << manager->_chunksMap.size() << " chunks" << endl;
        }
    }

    // Exposes the new 'manager's range map and version to the "differ," which
    // is ultimately responsible for filling them in.
    SCMConfigDiffTracker differ(shard);
    differ.attach(ns, manager->_chunksMap, manager->_maxCollVersion, versionMap);

    try {
        scoped_ptr<ScopedDbConnection> connPtr(
            ScopedDbConnection::getInternalScopedDbConnection(_configLoc.toString(), 30));
        ScopedDbConnection& conn = *connPtr;

        auto_ptr<DBClientCursor> cursor = conn->query(ChunkType::ConfigNS,
                                                      differ.configDiffQuery());

        if (!cursor.get()) {
            // The query does not fill 'errMsg' on failure, so fill it here.
            *errMsg = str::stream() << "could not obtain chunk cursor for " << ns
                                    << " from the config servers";

            manager->_maxCollVersion = ShardChunkVersion();
            manager->_chunksMap.clear();
            connPtr->done();
            return false;
        }

        // Diff tracker should *always* find at least one chunk if the collection exists.
        int diffsApplied = differ.calculateConfigDiff(*cursor);
        if (diffsApplied > 0) {

            LOG(2) << "loaded " << diffsApplied
                   << " chunks into new chunk manager for " << ns
                   << " with version " << manager->_maxCollVersion << endl;

            manager->_maxShardVersion = versionMap[shard];
            manager->fillRanges();
            connPtr->done();
            return true;
        }
        else if (diffsApplied == 0) {

            // No chunks were found for the ns.
            *errMsg = str::stream() << "no chunks found when reloading " << ns
                                    << ", previous version was "
                                    << manager->_maxCollVersion.toString();
            warning() << *errMsg << endl;

            manager->_maxCollVersion = ShardChunkVersion();
            manager->_chunksMap.clear();
            connPtr->done();
            return false;
        }
        else {

            // TODO: make this impossible by making sure we don't migrate / split on this
            // shard during the reload.
            *errMsg = str::stream() << "invalid chunks found when reloading " << ns
                                    << ", previous version was "
                                    << manager->_maxCollVersion.toString()
                                    << ", this should be rare";
            warning() << *errMsg << endl;

            manager->_maxCollVersion = ShardChunkVersion();
            manager->_chunksMap.clear();
            connPtr->done();
            return false;
        }
    }
    catch (const DBException& e) {
        *errMsg = str::stream() << "caught exception accessing the config servers"
                                << causedBy(e);

        // We deliberately do not return connPtr to the pool, since it was involved
        // with the error here.

        return false;
    }
}
void ShardChunkManager::_init( const string& configServer , const string& ns , const string& shardName, ShardChunkManagerPtr oldManager ) {

    // have to get a connection to the config db
    // special case if I'm the configdb since I'm locked and if I connect to myself
    // its a deadlock
    scoped_ptr<ScopedDbConnection> scoped;
    scoped_ptr<DBDirectClient> direct;
    DBClientBase * conn;

    if ( configServer.empty() ) {
        direct.reset( new DBDirectClient() );
        conn = direct.get();
    }
    else {
        scoped.reset( ScopedDbConnection::getInternalScopedDbConnection( configServer, 30.0 ) );
        conn = scoped->get();
    }

    // get this collection's sharding key
    BSONObj collectionDoc = conn->findOne( "config.collections", BSON( "_id" << ns ) );

    if( collectionDoc.isEmpty() ){
        warning() << ns << " does not exist as a sharded collection" << endl;
        return;
    }

    if( collectionDoc["dropped"].Bool() ){
        warning() << ns << " was dropped. Re-shard collection first." << endl;
        return;
    }

    _fillCollectionKey( collectionDoc );

    map<string,ShardChunkVersion> versionMap;
    versionMap[ shardName ] = _version;
    _collVersion = ShardChunkVersion( 0, OID() );

    // Check to see if we have an old ShardChunkManager to use
    if( oldManager && oldManager->_collVersion.isSet() ){

        versionMap[ shardName ] = oldManager->_version;
        _collVersion = oldManager->_collVersion;
        // TODO: This could be made more efficient if copying not required, but not as
        // frequently reloaded as in mongos.
        _chunksMap = oldManager->_chunksMap;

        LOG(2) << "loading new chunks for collection " << ns
               << " using old chunk manager w/ version " << _collVersion
               << " and " << _chunksMap.size() << " chunks" << endl;
    }

    // Attach our config diff tracker to our range map and versions
    SCMConfigDiffTracker differ( shardName );
    differ.attach( ns, _chunksMap, _collVersion, versionMap );

    // Need to do the query ourselves, since we may use direct conns to the db
    Query query = differ.configDiffQuery();
    auto_ptr<DBClientCursor> cursor = conn->query( "config.chunks" , query );

    uassert( 16181,
             str::stream() << "could not initialize cursor to config server chunks collection for ns " << ns,
             cursor.get() );

    // Diff tracker should *always* find at least one chunk if collection exists
    int diffsApplied = differ.calculateConfigDiff( *cursor );

    if( diffsApplied > 0 ){

        LOG(2) << "loaded " << diffsApplied << " chunks into new chunk manager for " << ns
               << " with version " << _collVersion << endl;

        // Save the new version of this shard
        _version = versionMap[ shardName ];
        _fillRanges();
    }
    else if( diffsApplied == 0 ){

        // No chunks were found for the ns
        warning() << "no chunks found when reloading " << ns
                  << ", previous version was " << _collVersion << endl;

        _version = ShardChunkVersion( 0, OID() );
        _collVersion = ShardChunkVersion( 0, OID() );
        _chunksMap.clear();
    }
    else {

        // TODO: make this impossible by making sure we don't migrate / split on this shard
        // during the reload
        warning() << "invalid chunks found when reloading " << ns
                  << ", previous version was " << _collVersion << ", this should be rare" << endl;

        // Handle the same way as a connectivity error, for now
        // TODO: handle inline
        uassert( 16229,
                 str::stream() << "could not initialize cursor to config server chunks collection for ns " << ns,
                 cursor.get() );
    }

    if ( scoped.get() )
        scoped->done();

    if ( _chunksMap.empty() )
        log() << "no chunk for collection " << ns << " on shard " << shardName << endl;
}
template < class ValType, class ShardType >
Query ConfigDiffTracker<ValType,ShardType>::
    configDiffQuery( const set<ShardChunkVersion>& extraMinorVersions ) const
{
    verifyAttached();

    //
    // Basic idea behind the query is to find all the chunks $gt the current max version, and
    // then also pull chunks for which we only need minor versions - splits and (2.0) max
    // chunks on shards
    //

    static const int maxMinorVersionClauses = 50;
    BSONObjBuilder queryB;

    int numStaleMinorClauses = extraMinorVersions.size() + _maxShardVersions->size();

#ifdef _DEBUG
    // In debug builds, randomly trigger full reloads to exercise both codepaths
    if( rand() % 2 ) numStaleMinorClauses = maxMinorVersionClauses;
#endif

    if( numStaleMinorClauses < maxMinorVersionClauses ){

        BSONArrayBuilder queryOrB( queryB.subarrayStart( "$or" ) );

        //
        // Get any version changes higher than we know currently
        //
        {
            BSONObjBuilder queryNewB( queryOrB.subobjStart() );

            queryNewB.append( "ns", _ns );
            {
                BSONObjBuilder ts( queryNewB.subobjStart( "lastmod" ) );
                // We should *always* pull at least a single chunk back, this lets us quickly
                // detect if our collection was unsharded (and most of the time if it was
                // resharded) in the meantime
                ts.appendTimestamp( "$gte", _maxVersion->toLong() );
                ts.done();
            }

            queryNewB.done();
        }

        // Get any shard version changes higher than we know currently
        // Needed since there could have been a split of the max version chunk of any shard
        // TODO: Ideally, we shouldn't care about these
        for( typename map<ShardType, ShardChunkVersion>::const_iterator it = _maxShardVersions->begin();
             it != _maxShardVersions->end(); it++ )
        {
            BSONObjBuilder queryShardB( queryOrB.subobjStart() );

            queryShardB.append( "ns", _ns );
            queryShardB.append( "shard", nameFrom( it->first ) );
            {
                BSONObjBuilder ts( queryShardB.subobjStart( "lastmod" ) );
                ts.appendTimestamp( "$gt", it->second.toLong() );
                ts.done();
            }
            queryShardB.done();
        }

        // Get any minor version changes we've marked as interesting
        // TODO: Ideally we shouldn't care about these
        for( set<ShardChunkVersion>::const_iterator it = extraMinorVersions.begin();
             it != extraMinorVersions.end(); it++ )
        {
            BSONObjBuilder queryShardB( queryOrB.subobjStart() );

            queryShardB.append( "ns", _ns );
            {
                BSONObjBuilder ts( queryShardB.subobjStart( "lastmod" ) );
                ts.appendTimestamp( "$gt", it->toLong() );
                ts.appendTimestamp( "$lt",
                                    ShardChunkVersion( it->majorVersion() + 1, 0, OID() ).toLong() );
                ts.done();
            }
            queryShardB.done();
        }

        queryOrB.done();
    }
    else{

        //
        // We don't want to send a giant $or query to the server, so just get all the chunks
        //

        queryB.append( "ns", _ns );
    }

    BSONObj query = queryB.obj();

    // log() << "major version query from " << *_maxVersion << " and over "
    //       << _maxShardVersions->size() << " shards is " << query << endl;

    return Query( query );
}
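// For reference, the $or query built above looks roughly like the following
// (illustrative namespace, shard name, and timestamps only):
//
//   { $or: [ { ns: "test.foo", lastmod: { $gte: Timestamp(4, 1) } },          // anything at or past our max
//            { ns: "test.foo", shard: "shard0000",
//              lastmod: { $gt: Timestamp(4, 1) } },                           // per-shard splits past that shard's max
//            { ns: "test.foo",
//              lastmod: { $gt: Timestamp(3, 2), $lt: Timestamp(4, 0) } } ] }  // extra interesting minor versions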
template < class ValType, class ShardType >
int ConfigDiffTracker<ValType,ShardType>::
    calculateConfigDiff( DBClientCursorInterface& diffCursor )
{
    verifyAttached();

    // Apply the chunk changes to the ranges and versions
    //
    // Overall idea here is to work in two steps :
    // 1. For all the new chunks we find, increment the maximum version per-shard and
    //    per-collection, and remove any conflicting chunks from the ranges
    // 2. For all the new chunks we're interested in (all of them for mongos, just chunks on
    //    the shard for mongod) add them to the ranges
    //

    vector<BSONObj> newTracked;

    // Store epoch now so it doesn't change when we change max
    OID currEpoch = _maxVersion->epoch();

    _validDiffs = 0;
    while( diffCursor.more() ){

        BSONObj diffChunkDoc = diffCursor.next();

        ShardChunkVersion chunkVersion = ShardChunkVersion::fromBSON( diffChunkDoc, "lastmod" );

        if( diffChunkDoc[ "min" ].type() != Object || diffChunkDoc[ "max" ].type() != Object ||
            diffChunkDoc[ "shard" ].type() != String )
        {
            warning() << "got invalid chunk document " << diffChunkDoc
                      << " when trying to load differing chunks" << endl;
            continue;
        }

        if( ! chunkVersion.isSet() || ! chunkVersion.hasCompatibleEpoch( currEpoch ) ){

            warning() << "got invalid chunk version " << chunkVersion << " in document " << diffChunkDoc
                      << " when trying to load differing chunks at version "
                      << ShardChunkVersion( _maxVersion->toLong(), currEpoch ) << endl;

            // Don't keep loading, since we know we'll be broken here
            return -1;
        }

        _validDiffs++;

        // Get max changed version and chunk version
        if( chunkVersion > *_maxVersion ) *_maxVersion = chunkVersion;

        // Chunk version changes
        ShardType shard = shardFor( diffChunkDoc[ "shard" ].String() );
        typename map<ShardType, ShardChunkVersion>::iterator shardVersionIt = _maxShardVersions->find( shard );
        if( shardVersionIt == _maxShardVersions->end() || shardVersionIt->second < chunkVersion ){
            (*_maxShardVersions)[ shard ] = chunkVersion;
        }

        // See if we need to remove any chunks we are currently tracking b/c of this chunk's changes
        removeOverlapping( diffChunkDoc[ "min" ].Obj(), diffChunkDoc[ "max" ].Obj() );

        // Figure out which of the new chunks we need to track
        // Important - we need to actually own this doc, in case the cursor decides to getMore or unbuffer
        if( isTracked( diffChunkDoc ) ) newTracked.push_back( diffChunkDoc.getOwned() );
    }

    LOG(3) << "found " << _validDiffs << " new chunks for collection " << _ns
           << " (tracking " << newTracked.size() << "), new version is " << *_maxVersion << endl;

    for( vector<BSONObj>::iterator it = newTracked.begin(); it != newTracked.end(); it++ ){

        BSONObj chunkDoc = *it;

        // Important - we need to make sure we actually own the min and max here
        BSONObj min = chunkDoc[ "min" ].Obj().getOwned();
        BSONObj max = chunkDoc[ "max" ].Obj().getOwned();

        // Invariant enforced by sharding
        // It's possible to read inconsistent state b/c of getMore() and yielding, so we want
        // to detect as early as possible.
        // TODO: This checks for overlap, we also should check for holes here iff we're tracking
        // all chunks
        if( isOverlapping( min, max ) ) return -1;

        _currMap->insert( rangeFor( chunkDoc, min, max ) );
    }

    return _validDiffs;
}
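// A minimal sketch of the overlap check used above (hypothetical; not the
// actual isOverlapping() implementation), assuming the range map orders
// half-open [min, max) intervals by their min key and maps each min to its
// max key:
typedef map<BSONObj, BSONObj, BSONObjCmp> SketchRangeMap;

bool isOverlappingSketch( const SketchRangeMap& ranges, const BSONObj& min, const BSONObj& max ) {
    // The first range starting at or after 'min' conflicts if it starts before 'max'
    SketchRangeMap::const_iterator next = ranges.lower_bound( min );
    if ( next != ranges.end() && next->first.woCompare( max ) < 0 ) return true;

    // The range just before 'min' conflicts if its max extends past 'min'
    if ( next != ranges.begin() ) {
        --next;
        if ( next->second.woCompare( min ) > 0 ) return true;
    }

    return false;
}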
void run() {

    int numShards = 10;
    int numInitialChunks = 5;
    int maxChunks = 100000; // Needed to not overflow the BSONArray's max bytes
    int keySize = 2;

    BSONArrayBuilder chunksB;

    BSONObj lastSplitPt;
    ShardChunkVersion version( 1, 0, OID() );

    //
    // Generate numChunks with a given key size over numShards
    // All chunks have double key values, so we can split them a bunch
    //

    for( int i = -1; i < numInitialChunks; i++ ){

        BSONObjBuilder splitPtB;
        for( int k = 0; k < keySize; k++ ){
            string field = string( "k" ) + string( 1, (char)('0' + k) );
            if( i < 0 ) splitPtB.appendMinKey( field );
            else if( i < numInitialChunks - 1 ) splitPtB.append( field, (double)i );
            else splitPtB.appendMaxKey( field );
        }
        BSONObj splitPt = splitPtB.obj();

        if( i >= 0 ){
            BSONObjBuilder chunkB;

            chunkB.append( "min", lastSplitPt );
            chunkB.append( "max", splitPt );

            int shardNum = rand( numShards );
            chunkB.append( "shard", "shard" + string( 1, (char)('A' + shardNum) ) );

            rand( 2 ) ? version.incMajor() : version.incMinor();
            version.addToBSON( chunkB, "lastmod" );

            chunksB.append( chunkB.obj() );
        }

        lastSplitPt = splitPt;
    }

    BSONArray chunks = chunksB.arr();

    // log() << "Chunks generated : " << chunks << endl;

    DBClientMockCursor chunksCursor( chunks );

    // Setup the empty ranges and versions first
    RangeMap ranges;
    ShardChunkVersion maxVersion = ShardChunkVersion( 0, 0, OID() );
    VersionMap maxShardVersions;

    // Create a differ which will track our progress
    boost::shared_ptr< DefaultDiffAdapter > differ( _inverse ? new InverseDiffAdapter() : new DefaultDiffAdapter() );
    differ->attach( "test", ranges, maxVersion, maxShardVersions );

    // Validate initial load
    differ->calculateConfigDiff( chunksCursor );
    validate( chunks, ranges, maxVersion, maxShardVersions );

    // Generate a lot of diffs, and keep validating that updating from the diffs always
    // gives us the right ranges and versions
    int numDiffs = 135; // Makes about 100000 chunks overall
    int numChunks = numInitialChunks;
    for( int i = 0; i < numDiffs; i++ ){

        // log() << "Generating new diff... " << i << endl;

        BSONArrayBuilder diffsB;
        BSONArrayBuilder newChunksB;
        BSONObjIterator chunksIt( chunks );

        while( chunksIt.more() ){

            BSONObj chunk = chunksIt.next().Obj();

            int randChoice = rand( 10 );

            if( randChoice < 2 && numChunks < maxChunks ){
                // Simulate a split

                // log() << " ...starting a split with chunk " << chunk << endl;

                BSONObjBuilder leftB;
                BSONObjBuilder rightB;
                BSONObjBuilder midB;

                for( int k = 0; k < keySize; k++ ){
                    string field = string( "k" ) + string( 1, (char)('0' + k) );

                    BSONType maxType = chunk["max"].Obj()[field].type();
                    double max = maxType == NumberDouble ? chunk["max"].Obj()[field].Number() : 0.0;
                    BSONType minType = chunk["min"].Obj()[field].type();
                    double min = minType == NumberDouble ? chunk["min"].Obj()[field].Number() : 0.0;

                    if( minType == MinKey ){
                        midB.append( field, max - 1.0 );
                    }
                    else if( maxType == MaxKey ){
                        midB.append( field, min + 1.0 );
                    }
                    else {
                        midB.append( field, ( max + min ) / 2.0 );
                    }
                }

                BSONObj midPt = midB.obj();
                // Only happens if we can't split the min chunk
                if( midPt.isEmpty() ) continue;

                leftB.append( chunk["min"] );
                leftB.append( "max", midPt );
                rightB.append( "min", midPt );
                rightB.append( chunk["max"] );

                leftB.append( chunk["shard"] );
                rightB.append( chunk["shard"] );

                version.incMajor();
                version._minor = 0;
                version.addToBSON( leftB, "lastmod" );
                version.incMinor();
                version.addToBSON( rightB, "lastmod" );

                BSONObj left = leftB.obj();
                BSONObj right = rightB.obj();

                // log() << " ... split into " << left << " and " << right << endl;

                newChunksB.append( left );
                newChunksB.append( right );

                diffsB.append( right );
                diffsB.append( left );

                numChunks++;
            }
            else if( randChoice < 4 && chunksIt.more() ){
                // Simulate a migrate

                // log() << " ...starting a migrate with chunk " << chunk << endl;

                BSONObj prevShardChunk;
                while( chunksIt.more() ){
                    prevShardChunk = chunksIt.next().Obj();
                    if( prevShardChunk["shard"].String() == chunk["shard"].String() ) break;

                    // log() << "... appending chunk from diff shard: " << prevShardChunk << endl;
                    newChunksB.append( prevShardChunk );

                    prevShardChunk = BSONObj();
                }

                // We need to move between different shards, hence the weirdness in logic here
                if( ! prevShardChunk.isEmpty() ){

                    BSONObjBuilder newShardB;
                    BSONObjBuilder prevShardB;

                    newShardB.append( chunk["min"] );
                    newShardB.append( chunk["max"] );
                    prevShardB.append( prevShardChunk["min"] );
                    prevShardB.append( prevShardChunk["max"] );

                    int shardNum = rand( numShards );
                    newShardB.append( "shard", "shard" + string( 1, (char)('A' + shardNum) ) );
                    prevShardB.append( prevShardChunk["shard"] );

                    version.incMajor();
                    version._minor = 0;
                    version.addToBSON( newShardB, "lastmod" );
                    version.incMinor();
                    version.addToBSON( prevShardB, "lastmod" );

                    BSONObj newShard = newShardB.obj();
                    BSONObj prevShard = prevShardB.obj();

                    // log() << " ... migrated to " << newShard << " and updated " << prevShard << endl;

                    newChunksB.append( newShard );
                    newChunksB.append( prevShard );

                    diffsB.append( newShard );
                    diffsB.append( prevShard );
                }
                else{
                    // log() << "... appending chunk, no more left: " << chunk << endl;
                    newChunksB.append( chunk );
                }
            }
            else{
                // log() << "Appending chunk : " << chunk << endl;
                newChunksB.append( chunk );
            }
        }

        BSONArray diffs = diffsB.arr();
        chunks = newChunksB.arr();

        // log() << "Diffs generated : " << diffs << endl;
        // log() << "All chunks : " << chunks << endl;

        // Rarely entirely clear out our data
        if( rand( 10 ) < 1 ){
            diffs = chunks;
            ranges.clear();
            maxVersion = ShardChunkVersion( 0, 0, OID() );
            maxShardVersions.clear();
        }

        // log() << "Total number of chunks : " << numChunks << " iteration " << i << endl;

        DBClientMockCursor diffCursor( diffs );

        differ->calculateConfigDiff( diffCursor );

        validate( chunks, ranges, maxVersion, maxShardVersions );
    }
}