bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
    string ns = cmdObj["getShardVersion"].valuestrsafe();
    if ( ns.size() == 0 ) {
        errmsg = "need to specify full namespace";
        return false;
    }

    result.append( "configServer" , shardingState.getConfigServer() );
    result.appendTimestamp( "global" , shardingState.getVersion(ns).toLong() );

    ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
    result.appendBool( "inShardedMode" , info != 0 );
    if ( info )
        result.appendTimestamp( "mine" , info->getVersion(ns).toLong() );
    else
        result.appendTimestamp( "mine" , 0 );

    if ( cmdObj["fullMetadata"].trueValue() ) {
        CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( ns );
        if ( metadata )
            result.append( "metadata", metadata->toBSON() );
        else
            result.append( "metadata", BSONObj() );
    }

    return true;
}
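For context, here is a minimal sketch of how a client might invoke this command against a mongod. It assumes the legacy C++ client (DBClientConnection) is available; the host name and the "test.foo" namespace are placeholders, not values taken from the original source.

#include <iostream>
#include <string>
#include "mongo/client/dbclient.h"  // legacy C++ client, assumed available

// Hypothetical illustration: ask a shard directly for its view of a collection's shard version.
void printShardVersion() {
    mongo::DBClientConnection conn;
    std::string errmsg;
    if ( !conn.connect("shard0.example.net:27018", errmsg) )  // placeholder host
        return;

    mongo::BSONObj res;
    // getShardVersion is admin-only; "test.foo" is a placeholder namespace
    conn.runCommand("admin", BSON("getShardVersion" << "test.foo"), res);
    std::cout << res.jsonString() << std::endl;
}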
bool checkConfigOrInit( const string& configdb , bool authoritative , string& errmsg , BSONObjBuilder& result , bool locked=false ) const {
    if ( configdb.size() == 0 ) {
        errmsg = "no configdb";
        return false;
    }

    if ( shardingState.enabled() ) {
        if ( configdb == shardingState.getConfigServer() )
            return true;

        result.append( "configdb" , BSON( "stored" << shardingState.getConfigServer() <<
                                          "given" << configdb ) );
        errmsg = str::stream() << "mongos specified a different config database string : "
                               << "stored : " << shardingState.getConfigServer()
                               << " vs given : " << configdb;
        return false;
    }

    if ( ! authoritative ) {
        result.appendBool( "need_authoritative" , true );
        errmsg = "first setShardVersion";
        return false;
    }

    if ( locked ) {
        ShardedConnectionInfo::addHook();
        shardingState.enable( configdb );
        configServer.init( configdb );
        return true;
    }

    Lock::GlobalWrite lk;
    return checkConfigOrInit( configdb , authoritative , errmsg , result , true );
}
bool checkConfigOrInit( const string& configdb , bool authoritative , string& errmsg , BSONObjBuilder& result , bool locked=false ) const {
    if ( configdb.size() == 0 ) {
        errmsg = "no configdb";
        return false;
    }

    if ( shardingState.enabled() ) {
        if ( configdb == shardingState.getConfigServer() )
            return true;

        result.append( "configdb" , BSON( "stored" << shardingState.getConfigServer() <<
                                          "given" << configdb ) );
        errmsg = "specified a different configdb!";
        return false;
    }

    if ( ! authoritative ) {
        result.appendBool( "need_authoritative" , true );
        errmsg = "first setShardVersion";
        return false;
    }

    if ( locked ) {
        shardingState.enable( configdb );
        configServer.init( configdb );
        return true;
    }

    dblock lk;
    return checkConfigOrInit( configdb , authoritative , errmsg , result , true );
}
void logDeleteOpForSharding(OperationContext* txn,
                            const char* ns,
                            const BSONObj& obj,
                            bool notInActiveChunk) {
    ShardingState* shardingState = ShardingState::get(txn);
    if (shardingState->enabled())
        shardingState->migrationSourceManager()->logDeleteOp(txn, ns, obj, notInActiveChunk);
}
/**
 * @return true if not in sharded mode or if version for this client is ok
 */
bool shardVersionOk( const string& ns , bool isWriteOp , string& errmsg ) {
    if ( ! shardingState.enabled() )
        return true;

    ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
    if ( ! info ) {
        // this means the client has nothing sharded
        // so this allows direct connections to do whatever they want
        // which i think is the correct behavior
        return true;
    }

    if ( info->inForceVersionOkMode() ) {
        return true;
    }

    // TODO
    //   all collections at some point, be sharded or not, will have a version
    //   (and a ShardChunkManager)
    //   for now, we remove the sharding state of dropped collection
    //   so delayed request may come in. This has to be fixed.
    ConfigVersion clientVersion = info->getVersion(ns);
    ConfigVersion version;
    if ( ! shardingState.hasVersion( ns , version ) && clientVersion == 0 ) {
        return true;
    }

    if ( version == 0 && clientVersion > 0 ) {
        stringstream ss;
        ss << "collection was dropped or this shard no longer valid version: " << version
           << " clientVersion: " << clientVersion;
        errmsg = ss.str();
        return false;
    }

    if ( clientVersion >= version )
        return true;

    if ( clientVersion == 0 ) {
        stringstream ss;
        ss << "client in sharded mode, but doesn't have version set for this collection: " << ns
           << " myVersion: " << version;
        errmsg = ss.str();
        return false;
    }

    if ( version.majorVersion() == clientVersion.majorVersion() ) {
        // this means there was just a split
        // since on a split w/o a migrate this server is ok
        // going to accept
        return true;
    }

    stringstream ss;
    ss << "your version is too old ns: " + ns << " global: " << version
       << " client: " << clientVersion;
    errmsg = ss.str();
    return false;
}
bool haveLocalShardingInfo( const string& ns ) {
    if ( ! shardingState.enabled() )
        return false;

    if ( ! shardingState.hasVersion( ns ) )
        return false;

    return ShardedConnectionInfo::get(false) > 0;
}
/**
 * @return true if not in sharded mode or if version for this client is ok
 */
bool shardVersionOk( const string& ns , bool isWriteOp , string& errmsg ) {
    if ( ! shardingState.enabled() )
        return true;

    ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
    if ( ! info ) {
        // this means the client has nothing sharded
        // so this allows direct connections to do whatever they want
        // which i think is the correct behavior
        return true;
    }

    if ( info->inForceVersionOkMode() ) {
        return true;
    }

    ConfigVersion version;
    if ( ! shardingState.hasVersion( ns , version ) ) {
        return true;
    }

    ConfigVersion clientVersion = info->getVersion(ns);

    if ( version == 0 && clientVersion > 0 ) {
        stringstream ss;
        ss << "collection was dropped or this shard no longer valid version: " << version
           << " clientVersion: " << clientVersion;
        errmsg = ss.str();
        return false;
    }

    if ( clientVersion >= version )
        return true;

    if ( clientVersion == 0 ) {
        stringstream ss;
        ss << "client in sharded mode, but doesn't have version set for this collection: " << ns
           << " myVersion: " << version;
        errmsg = ss.str();
        return false;
    }

    if ( isWriteOp && version.majorVersion() == clientVersion.majorVersion() ) {
        // this means there was just a split
        // since on a split w/o a migrate this server is ok
        // going to accept write
        return true;
    }

    stringstream ss;
    ss << "your version is too old ns: " + ns << " global: " << version
       << " client: " << clientVersion;
    errmsg = ss.str();
    return false;
}
bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
    string ns = cmdObj["getShardVersion"].valuestrsafe();
    if ( ns.size() == 0 ) {
        errmsg = "need to specify full namespace";
        return false;
    }

    result.append( "configServer" , shardingState.getConfigServer() );
    result.appendTimestamp( "global" , shardingState.getVersion(ns) );

    ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
    if ( info )
        result.appendTimestamp( "mine" , info->getVersion(ns) );
    else
        result.appendTimestamp( "mine" , 0 );

    return true;
}
bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
    // Debugging code for SERVER-1633. Commands already have a coarser timer for
    // normal operation.
    Timer timer;
    vector<int> laps;

    lastError.disableForCommand();
    ShardedConnectionInfo* info = ShardedConnectionInfo::get( true );

    bool authoritative = cmdObj.getBoolField( "authoritative" );
    string configdb = cmdObj["configdb"].valuestrsafe();
    { // configdb checking
        if ( configdb.size() == 0 ) {
            errmsg = "no configdb";
            return false;
        }

        if ( shardingState.enabled() ) {
            if ( configdb != shardingState.getConfigServer() ) {
                errmsg = "specified a different configdb!";
                return false;
            }
        }
        else {
            if ( ! authoritative ) {
                result.appendBool( "need_authoritative" , true );
                errmsg = "first setShardVersion";
                return false;
            }
            shardingState.enable( configdb );
            configServer.init( configdb );
        }
    }

    // SERVER-1633
    laps.push_back( timer.millis() );

    if ( cmdObj["shard"].type() == String ) {
        shardingState.gotShardName( cmdObj["shard"].String() );
        shardingState.gotShardHost( cmdObj["shardHost"].String() );
    }

    { // setting up ids
        if ( cmdObj["serverID"].type() != jstOID ) {
            // TODO: fix this
            //errmsg = "need serverID to be an OID";
            //return 0;
        }
        else {
            OID clientId = cmdObj["serverID"].__oid();
            if ( ! info->hasID() ) {
                info->setID( clientId );
            }
            else if ( clientId != info->getID() ) {
                errmsg = "server id has changed!";
                return 0;
            }
        }
    }

    // SERVER-1633
    laps.push_back( timer.millis() );

    unsigned long long version = extractVersion( cmdObj["version"] , errmsg );
    if ( errmsg.size() ) {
        return false;
    }

    string ns = cmdObj["setShardVersion"].valuestrsafe();
    if ( ns.size() == 0 ) {
        errmsg = "need to specify full namespace";
        return false;
    }

    ConfigVersion& oldVersion = info->getVersion(ns);
    ConfigVersion& globalVersion = shardingState.getVersion(ns);

    if ( oldVersion > 0 && globalVersion == 0 ) {
        // this had been reset
        oldVersion = 0;
    }

    if ( version == 0 && globalVersion == 0 ) {
        // this connection is cleaning itself
        oldVersion = 0;
        return 1;
    }

    // SERVER-1633
    laps.push_back( timer.millis() );

    if ( version == 0 && globalVersion > 0 ) {
        if ( ! authoritative ) {
            result.appendBool( "need_authoritative" , true );
            result.appendTimestamp( "globalVersion" , globalVersion );
            result.appendTimestamp( "oldVersion" , oldVersion );
            errmsg = "dropping needs to be authoritative";
            return 0;
        }
        log() << "wiping data for: " << ns << endl;
        result.appendTimestamp( "beforeDrop" , globalVersion );
        // only setting global version on purpose
        // need clients to re-find meta-data
        globalVersion = 0;
        oldVersion = 0;
        return 1;
    }

    if ( version < oldVersion ) {
        errmsg = "you already have a newer version";
        result.appendTimestamp( "oldVersion" , oldVersion );
        result.appendTimestamp( "newVersion" , version );
        result.appendTimestamp( "globalVersion" , globalVersion );
        return false;
    }

    // SERVER-1633
    laps.push_back( timer.millis() );

    if ( version < globalVersion ) {
        while ( shardingState.inCriticalMigrateSection() ) {
            dbtemprelease r;
            sleepmillis(2);
            log() << "waiting till out of critical section" << endl;
        }
        errmsg = "going to older version for global";
        result.appendTimestamp( "version" , version );
        result.appendTimestamp( "globalVersion" , globalVersion );
        return false;
    }

    if ( globalVersion == 0 && ! cmdObj.getBoolField( "authoritative" ) ) {
        // need authoritative for first look
        result.appendBool( "need_authoritative" , true );
        result.append( "ns" , ns );
        errmsg = "first time for this ns";
        return false;
    }

    // SERVER-1633
    laps.push_back( timer.millis() );

    {
        dbtemprelease unlock;
        shardingState.getChunkMatcher( ns );
    }

    result.appendTimestamp( "oldVersion" , oldVersion );
    oldVersion = version;
    globalVersion = version;

    // SERVER-1633
    ostringstream lapString;
    lapString << name /* command name */ << " partials: " ;
    for (size_t i = 1; i < laps.size(); ++i) {
        lapString << (laps[i] - laps[i-1]) / 1000 << " ";
    }
    lapString << endl;
    logIfSlow( timer, lapString.str() );

    result.append( "ok" , 1 );
    return 1;
}
bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
    shardingState.appendInfo( result );
    return true;
}
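As with getShardVersion above, a brief hedged sketch of how the shardingState command might be queried to dump a shard's cached metadata (config server, shard name, cached versions); it reuses the conn connection from the earlier sketch, which is an assumption of this illustration rather than part of the original code.

// Hypothetical illustration: dump this shard's sharding state.
mongo::BSONObj res;
conn.runCommand("admin", BSON("shardingState" << 1), res);
std::cout << res.jsonString() << std::endl;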
bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
    lastError.disableForCommand();
    ShardedConnectionInfo* info = ShardedConnectionInfo::get( true );

    bool authoritative = cmdObj.getBoolField( "authoritative" );
    string configdb = cmdObj["configdb"].valuestrsafe();
    { // configdb checking
        if ( configdb.size() == 0 ) {
            errmsg = "no configdb";
            return false;
        }

        if ( shardingState.enabled() ) {
            if ( configdb != shardingState.getConfigServer() ) {
                errmsg = "specified a different configdb!";
                return false;
            }
        }
        else {
            if ( ! authoritative ) {
                result.appendBool( "need_authoritative" , true );
                errmsg = "first setShardVersion";
                return false;
            }
            shardingState.enable( configdb );
            configServer.init( configdb );
        }
    }

    if ( cmdObj["shard"].type() == String ) {
        shardingState.gotShardName( cmdObj["shard"].String() );
        shardingState.gotShardHost( cmdObj["shardHost"].String() );
    }

    { // setting up ids
        if ( cmdObj["serverID"].type() != jstOID ) {
            // TODO: fix this
            //errmsg = "need serverID to be an OID";
            //return 0;
        }
        else {
            OID clientId = cmdObj["serverID"].__oid();
            if ( ! info->hasID() ) {
                info->setID( clientId );
            }
            else if ( clientId != info->getID() ) {
                errmsg = "server id has changed!";
                return 0;
            }
        }
    }

    unsigned long long version = extractVersion( cmdObj["version"] , errmsg );
    if ( errmsg.size() ) {
        return false;
    }

    string ns = cmdObj["setShardVersion"].valuestrsafe();
    if ( ns.size() == 0 ) {
        errmsg = "need to specify full namespace";
        return false;
    }

    const ConfigVersion oldVersion = info->getVersion(ns);
    const ConfigVersion globalVersion = shardingState.getVersion(ns);

    if ( oldVersion > 0 && globalVersion == 0 ) {
        // this had been reset
        info->setVersion( ns , 0 );
    }

    if ( version == 0 && globalVersion == 0 ) {
        // this connection is cleaning itself
        info->setVersion( ns , 0 );
        return true;
    }

    if ( version == 0 && globalVersion > 0 ) {
        if ( ! authoritative ) {
            result.appendBool( "need_authoritative" , true );
            result.append( "ns" , ns );
            result.appendTimestamp( "globalVersion" , globalVersion );
            result.appendTimestamp( "oldVersion" , oldVersion );
            errmsg = "dropping needs to be authoritative";
            return false;
        }
        log() << "wiping data for: " << ns << endl;
        result.appendTimestamp( "beforeDrop" , globalVersion );
        // only setting global version on purpose
        // need clients to re-find meta-data
        shardingState.resetVersion( ns );
        info->setVersion( ns , 0 );
        return true;
    }

    if ( version < oldVersion ) {
        errmsg = "you already have a newer version of collection '" + ns + "'";
        result.append( "ns" , ns );
        result.appendTimestamp( "oldVersion" , oldVersion );
        result.appendTimestamp( "newVersion" , version );
        result.appendTimestamp( "globalVersion" , globalVersion );
        return false;
    }

    if ( version < globalVersion ) {
        while ( shardingState.inCriticalMigrateSection() ) {
            dbtemprelease r;
            sleepmillis(2);
            OCCASIONALLY log() << "waiting till out of critical section" << endl;
        }
        errmsg = "going to older version for global for collection '" + ns + "'";
        result.append( "ns" , ns );
        result.appendTimestamp( "version" , version );
        result.appendTimestamp( "globalVersion" , globalVersion );
        return false;
    }

    if ( globalVersion == 0 && ! cmdObj.getBoolField( "authoritative" ) ) {
        // need authoritative for first look
        result.append( "ns" , ns );
        result.appendBool( "need_authoritative" , true );
        errmsg = "first time for collection '" + ns + "'";
        return false;
    }

    {
        dbtemprelease unlock;

        ShardChunkVersion currVersion = version;
        if ( ! shardingState.trySetVersion( ns , currVersion ) ) {
            errmsg = str::stream() << "client version differs from config's for collection '" << ns << "'";
            result.append( "ns" , ns );
            result.appendTimestamp( "version" , version );
            result.appendTimestamp( "globalVersion" , currVersion );
            return false;
        }
    }

    info->setVersion( ns , version );
    result.appendTimestamp( "oldVersion" , oldVersion );
    result.append( "ok" , 1 );

    return true;
}
namespace mongo { // -----ShardingState START ---- ShardingState::ShardingState() : _enabled(false) , _mutex( "ShardingState" ) { } void ShardingState::enable( const string& server ) { _enabled = true; assert( server.size() ); if ( _configServer.size() == 0 ) _configServer = server; else { assert( server == _configServer ); } } void ShardingState::gotShardName( const string& name ) { if ( _shardName.size() == 0 ) { // TODO SERVER-2299 verify the name is sound w.r.t IPs _shardName = name; return; } if ( _shardName == name ) return; stringstream ss; ss << "gotShardName different than what i had before " << " before [" << _shardName << "] " << " got [" << name << "] " ; uasserted( 13298 , ss.str() ); } void ShardingState::gotShardHost( string host ) { size_t slash = host.find( '/' ); if ( slash != string::npos ) host = host.substr( 0 , slash ); if ( _shardHost.size() == 0 ) { _shardHost = host; return; } if ( _shardHost == host ) return; stringstream ss; ss << "gotShardHost different than what i had before " << " before [" << _shardHost << "] " << " got [" << host << "] " ; uasserted( 13299 , ss.str() ); } void ShardingState::resetShardingState() { scoped_lock lk(_mutex); _enabled = false; _configServer.clear(); _shardName.clear(); _shardHost.clear(); _chunks.clear(); } // TODO we shouldn't need three ways for checking the version. Fix this. bool ShardingState::hasVersion( const string& ns ) { scoped_lock lk(_mutex); ChunkManagersMap::const_iterator it = _chunks.find(ns); return it != _chunks.end(); } bool ShardingState::hasVersion( const string& ns , ConfigVersion& version ) { scoped_lock lk(_mutex); ChunkManagersMap::const_iterator it = _chunks.find(ns); if ( it == _chunks.end() ) return false; ShardChunkManagerPtr p = it->second; version = p->getVersion(); return true; } const ConfigVersion ShardingState::getVersion( const string& ns ) const { scoped_lock lk(_mutex); ChunkManagersMap::const_iterator it = _chunks.find( ns ); if ( it != _chunks.end() ) { ShardChunkManagerPtr p = it->second; return p->getVersion(); } else { return 0; } } void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ) { scoped_lock lk( _mutex ); ChunkManagersMap::const_iterator it = _chunks.find( ns ); assert( it != _chunks.end() ) ; ShardChunkManagerPtr p = it->second; // empty shards should have version 0 version = ( p->getNumChunks() > 1 ) ? 
version : ShardChunkVersion( 0 , 0 ); ShardChunkManagerPtr cloned( p->cloneMinus( min , max , version ) ); _chunks[ns] = cloned; } void ShardingState::undoDonateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ) { scoped_lock lk( _mutex ); ChunkManagersMap::const_iterator it = _chunks.find( ns ); assert( it != _chunks.end() ) ; ShardChunkManagerPtr p( it->second->clonePlus( min , max , version ) ); _chunks[ns] = p; } void ShardingState::splitChunk( const string& ns , const BSONObj& min , const BSONObj& max , const vector<BSONObj>& splitKeys , ShardChunkVersion version ) { scoped_lock lk( _mutex ); ChunkManagersMap::const_iterator it = _chunks.find( ns ); assert( it != _chunks.end() ) ; ShardChunkManagerPtr p( it->second->cloneSplit( min , max , splitKeys , version ) ); _chunks[ns] = p; } void ShardingState::resetVersion( const string& ns ) { scoped_lock lk( _mutex ); _chunks.erase( ns ); } bool ShardingState::trySetVersion( const string& ns , ConfigVersion& version /* IN-OUT */ ) { // fast path - requested version is at the same version as this chunk manager // // cases: // + this shard updated the version for a migrate's commit (FROM side) // a client reloaded chunk state from config and picked the newest version // + two clients reloaded // one triggered the 'slow path' (below) // when the second's request gets here, the version is already current { scoped_lock lk( _mutex ); ChunkManagersMap::const_iterator it = _chunks.find( ns ); if ( it != _chunks.end() && it->second->getVersion() == version ) return true; } // slow path - requested version is different than the current chunk manager's, if one exists, so must check for // newest version in the config server // // cases: // + a chunk moved TO here // (we don't bump up the version on the TO side but the commit to config does use higher version) // a client reloads from config an issued the request // + there was a take over from a secondary // the secondary had no state (managers) at all, so every client request will fall here // + a stale client request a version that's not current anymore const string c = (_configServer == _shardHost) ? "" /* local */ : _configServer; ShardChunkManagerPtr p( new ShardChunkManager( c , ns , _shardName ) ); { scoped_lock lk( _mutex ); // since we loaded the chunk manager unlocked, other thread may have done the same // make sure we keep the freshest config info only ChunkManagersMap::const_iterator it = _chunks.find( ns ); if ( it == _chunks.end() || p->getVersion() >= it->second->getVersion() ) { _chunks[ns] = p; } ShardChunkVersion oldVersion = version; version = p->getVersion(); return oldVersion == version; } } void ShardingState::appendInfo( BSONObjBuilder& b ) { b.appendBool( "enabled" , _enabled ); if ( ! _enabled ) return; b.append( "configServer" , _configServer ); b.append( "shardName" , _shardName ); b.append( "shardHost" , _shardHost ); { BSONObjBuilder bb( b.subobjStart( "versions" ) ); scoped_lock lk(_mutex); for ( ChunkManagersMap::iterator it = _chunks.begin(); it != _chunks.end(); ++it ) { ShardChunkManagerPtr p = it->second; bb.appendTimestamp( it->first , p->getVersion() ); } bb.done(); } } bool ShardingState::needShardChunkManager( const string& ns ) const { if ( ! _enabled ) return false; if ( ! 
ShardedConnectionInfo::get( false ) ) return false; return true; } ShardChunkManagerPtr ShardingState::getShardChunkManager( const string& ns ) { scoped_lock lk( _mutex ); ChunkManagersMap::const_iterator it = _chunks.find( ns ); if ( it == _chunks.end() ) { return ShardChunkManagerPtr(); } else { return it->second; } } ShardingState shardingState; // -----ShardingState END ---- // -----ShardedConnectionInfo START ---- boost::thread_specific_ptr<ShardedConnectionInfo> ShardedConnectionInfo::_tl; ShardedConnectionInfo::ShardedConnectionInfo() { _forceVersionOk = false; _id.clear(); } ShardedConnectionInfo* ShardedConnectionInfo::get( bool create ) { ShardedConnectionInfo* info = _tl.get(); if ( ! info && create ) { log(1) << "entering shard mode for connection" << endl; info = new ShardedConnectionInfo(); _tl.reset( info ); } return info; } void ShardedConnectionInfo::reset() { _tl.reset(); } const ConfigVersion ShardedConnectionInfo::getVersion( const string& ns ) const { NSVersionMap::const_iterator it = _versions.find( ns ); if ( it != _versions.end() ) { return it->second; } else { return 0; } } void ShardedConnectionInfo::setVersion( const string& ns , const ConfigVersion& version ) { _versions[ns] = version; } void ShardedConnectionInfo::setID( const OID& id ) { _id = id; } // -----ShardedConnectionInfo END ---- unsigned long long extractVersion( BSONElement e , string& errmsg ) { if ( e.eoo() ) { errmsg = "no version"; return 0; } if ( e.isNumber() ) return (unsigned long long)e.number(); if ( e.type() == Date || e.type() == Timestamp ) return e._numberLong(); errmsg = "version is not a numeric type"; return 0; } class MongodShardCommand : public Command { public: MongodShardCommand( const char * n ) : Command( n ) { } virtual bool slaveOk() const { return false; } virtual bool adminOnly() const { return true; } }; bool haveLocalShardingInfo( const string& ns ) { if ( ! shardingState.enabled() ) return false; if ( ! shardingState.hasVersion( ns ) ) return false; return ShardedConnectionInfo::get(false) > 0; } class UnsetShardingCommand : public MongodShardCommand { public: UnsetShardingCommand() : MongodShardCommand("unsetSharding") {} virtual void help( stringstream& help ) const { help << " example: { unsetSharding : 1 } "; } virtual LockType locktype() const { return NONE; } virtual bool slaveOk() const { return true; } bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { ShardedConnectionInfo::reset(); return true; } } unsetShardingCommand; class SetShardVersion : public MongodShardCommand { public: SetShardVersion() : MongodShardCommand("setShardVersion") {} virtual void help( stringstream& help ) const { help << " example: { setShardVersion : 'alleyinsider.foo' , version : 1 , configdb : '' } "; } virtual LockType locktype() const { return WRITE; } // TODO: figure out how to make this not need to lock bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { lastError.disableForCommand(); ShardedConnectionInfo* info = ShardedConnectionInfo::get( true ); bool authoritative = cmdObj.getBoolField( "authoritative" ); string configdb = cmdObj["configdb"].valuestrsafe(); { // configdb checking if ( configdb.size() == 0 ) { errmsg = "no configdb"; return false; } if ( shardingState.enabled() ) { if ( configdb != shardingState.getConfigServer() ) { errmsg = "specified a different configdb!"; return false; } } else { if ( ! 
authoritative ) { result.appendBool( "need_authoritative" , true ); errmsg = "first setShardVersion"; return false; } shardingState.enable( configdb ); configServer.init( configdb ); } } if ( cmdObj["shard"].type() == String ) { shardingState.gotShardName( cmdObj["shard"].String() ); shardingState.gotShardHost( cmdObj["shardHost"].String() ); } { // setting up ids if ( cmdObj["serverID"].type() != jstOID ) { // TODO: fix this //errmsg = "need serverID to be an OID"; //return 0; } else { OID clientId = cmdObj["serverID"].__oid(); if ( ! info->hasID() ) { info->setID( clientId ); } else if ( clientId != info->getID() ) { errmsg = "server id has changed!"; return 0; } } } unsigned long long version = extractVersion( cmdObj["version"] , errmsg ); if ( errmsg.size() ) { return false; } string ns = cmdObj["setShardVersion"].valuestrsafe(); if ( ns.size() == 0 ) { errmsg = "need to speciy fully namespace"; return false; } const ConfigVersion oldVersion = info->getVersion(ns); const ConfigVersion globalVersion = shardingState.getVersion(ns); if ( oldVersion > 0 && globalVersion == 0 ) { // this had been reset info->setVersion( ns , 0 ); } if ( version == 0 && globalVersion == 0 ) { // this connection is cleaning itself info->setVersion( ns , 0 ); return true; } if ( version == 0 && globalVersion > 0 ) { if ( ! authoritative ) { result.appendBool( "need_authoritative" , true ); result.append( "ns" , ns ); result.appendTimestamp( "globalVersion" , globalVersion ); result.appendTimestamp( "oldVersion" , oldVersion ); errmsg = "dropping needs to be authoritative"; return false; } log() << "wiping data for: " << ns << endl; result.appendTimestamp( "beforeDrop" , globalVersion ); // only setting global version on purpose // need clients to re-find meta-data shardingState.resetVersion( ns ); info->setVersion( ns , 0 ); return true; } if ( version < oldVersion ) { errmsg = "you already have a newer version of collection '" + ns + "'"; result.append( "ns" , ns ); result.appendTimestamp( "oldVersion" , oldVersion ); result.appendTimestamp( "newVersion" , version ); result.appendTimestamp( "globalVersion" , globalVersion ); return false; } if ( version < globalVersion ) { while ( shardingState.inCriticalMigrateSection() ) { dbtemprelease r; sleepmillis(2); OCCASIONALLY log() << "waiting till out of critical section" << endl; } errmsg = "going to older version for global for collection '" + ns + "'"; result.append( "ns" , ns ); result.appendTimestamp( "version" , version ); result.appendTimestamp( "globalVersion" , globalVersion ); return false; } if ( globalVersion == 0 && ! cmdObj.getBoolField( "authoritative" ) ) { // need authoritative for first look result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "first time for collection '" + ns + "'"; return false; } { dbtemprelease unlock; ShardChunkVersion currVersion = version; if ( ! 
shardingState.trySetVersion( ns , currVersion ) ) { errmsg = str::stream() << "client version differs from config's for colleciton '" << ns << "'"; result.append( "ns" , ns ); result.appendTimestamp( "version" , version ); result.appendTimestamp( "globalVersion" , currVersion ); return false; } } info->setVersion( ns , version ); result.appendTimestamp( "oldVersion" , oldVersion ); result.append( "ok" , 1 ); return true; } } setShardVersionCmd; class GetShardVersion : public MongodShardCommand { public: GetShardVersion() : MongodShardCommand("getShardVersion") {} virtual void help( stringstream& help ) const { help << " example: { getShardVersion : 'alleyinsider.foo' } "; } virtual LockType locktype() const { return NONE; } bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string ns = cmdObj["getShardVersion"].valuestrsafe(); if ( ns.size() == 0 ) { errmsg = "need to speciy fully namespace"; return false; } result.append( "configServer" , shardingState.getConfigServer() ); result.appendTimestamp( "global" , shardingState.getVersion(ns) ); ShardedConnectionInfo* info = ShardedConnectionInfo::get( false ); if ( info ) result.appendTimestamp( "mine" , info->getVersion(ns) ); else result.appendTimestamp( "mine" , 0 ); return true; } } getShardVersion; class ShardingStateCmd : public MongodShardCommand { public: ShardingStateCmd() : MongodShardCommand( "shardingState" ) {} virtual LockType locktype() const { return WRITE; } // TODO: figure out how to make this not need to lock bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { shardingState.appendInfo( result ); return true; } } shardingStateCmd; /** * @ return true if not in sharded mode or if version for this client is ok */ bool shardVersionOk( const string& ns , bool isWriteOp , string& errmsg ) { if ( ! shardingState.enabled() ) return true; ShardedConnectionInfo* info = ShardedConnectionInfo::get( false ); if ( ! info ) { // this means the client has nothing sharded // so this allows direct connections to do whatever they want // which i think is the correct behavior return true; } if ( info->inForceVersionOkMode() ) { return true; } // TODO // all collections at some point, be sharded or not, will have a version (and a ShardChunkManager) // for now, we remove the sharding state of dropped collection // so delayed request may come in. This has to be fixed. ConfigVersion clientVersion = info->getVersion(ns); ConfigVersion version; if ( ! shardingState.hasVersion( ns , version ) && clientVersion == 0 ) { return true; } if ( version == 0 && clientVersion > 0 ) { stringstream ss; ss << "collection was dropped or this shard no longer valied version: " << version << " clientVersion: " << clientVersion; errmsg = ss.str(); return false; } if ( clientVersion >= version ) return true; if ( clientVersion == 0 ) { stringstream ss; ss << "client in sharded mode, but doesn't have version set for this collection: " << ns << " myVersion: " << version; errmsg = ss.str(); return false; } if ( version.majorVersion() == clientVersion.majorVersion() ) { // this means there was just a split // since on a split w/o a migrate this server is ok // going to accept return true; } stringstream ss; ss << "your version is too old ns: " + ns << " global: " << version << " client: " << clientVersion; errmsg = ss.str(); return false; } }
/**
 * @return true if not in sharded mode or if version for this client is ok
 */
bool shardVersionOk( const string& ns , string& errmsg, ConfigVersion& received, ConfigVersion& wanted ) {
    if ( ! shardingState.enabled() )
        return true;

    if ( ! isMasterNs( ns.c_str() ) ) {
        // right now connections to secondaries aren't versioned at all
        return true;
    }

    ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
    if ( ! info ) {
        // this means the client has nothing sharded
        // so this allows direct connections to do whatever they want
        // which i think is the correct behavior
        return true;
    }

    if ( info->inForceVersionOkMode() ) {
        return true;
    }

    // TODO
    //   all collections at some point, be sharded or not, will have a version
    //   (and a ShardChunkManager)
    //   for now, we remove the sharding state of dropped collection
    //   so delayed request may come in. This has to be fixed.
    ConfigVersion clientVersion = info->getVersion(ns);
    ConfigVersion version;
    if ( ! shardingState.hasVersion( ns , version ) && ! clientVersion.isSet() ) {
        return true;
    }

    // The versions we're going to compare, saved for future use
    received = clientVersion;
    wanted = version;

    if ( ! version.isSet() && clientVersion.isSet() ) {
        stringstream ss;
        ss << "collection was dropped or this shard no longer valid version";
        errmsg = ss.str();
        return false;
    }

    if ( clientVersion >= version )
        return true;

    if ( ! clientVersion.isSet() ) {
        stringstream ss;
        ss << "client in sharded mode, but doesn't have version set for this collection";
        errmsg = ss.str();
        return false;
    }

    if ( version.majorVersion() == clientVersion.majorVersion() ) {
        // this means there was just a split
        // since on a split w/o a migrate this server is ok
        // going to accept
        return true;
    }

    stringstream ss;
    ss << "your version is too old";
    errmsg = ss.str();
    return false;
}
namespace mongo { // -----ShardingState START ---- ShardingState::ShardingState() : _enabled(false) , _mutex( "ShardingState" ){ } void ShardingState::enable( const string& server ){ _enabled = true; assert( server.size() ); if ( _configServer.size() == 0 ) _configServer = server; else { assert( server == _configServer ); } } void ShardingState::gotShardName( const string& name ){ if ( _shardName.size() == 0 ){ _shardName = name; return; } if ( _shardName == name ) return; stringstream ss; ss << "gotShardName different than what i had before " << " before [" << _shardName << "] " << " got [" << name << "] " ; uasserted( 13298 , ss.str() ); } void ShardingState::gotShardHost( string host ){ size_t slash = host.find( '/' ); if ( slash != string::npos ) host = host.substr( 0 , slash ); if ( _shardHost.size() == 0 ){ _shardHost = host; return; } if ( _shardHost == host ) return; stringstream ss; ss << "gotShardHost different than what i had before " << " before [" << _shardHost << "] " << " got [" << host << "] " ; uasserted( 13299 , ss.str() ); } bool ShardingState::hasVersion( const string& ns ){ scoped_lock lk(_mutex); NSVersionMap::const_iterator i = _versions.find(ns); return i != _versions.end(); } bool ShardingState::hasVersion( const string& ns , ConfigVersion& version ){ scoped_lock lk(_mutex); NSVersionMap::const_iterator i = _versions.find(ns); if ( i == _versions.end() ) return false; version = i->second; return true; } const ConfigVersion ShardingState::getVersion( const string& ns ) const { scoped_lock lk(_mutex); NSVersionMap::const_iterator it = _versions.find( ns ); if ( it != _versions.end() ) { return it->second; } else { return 0; } } void ShardingState::setVersion( const string& ns , const ConfigVersion& version ){ scoped_lock lk(_mutex); if ( version != 0 ) { NSVersionMap::const_iterator it = _versions.find( ns ); // TODO 11-18-2010 as we're bringing chunk boundary information to mongod, it may happen that // we're setting a version for the ns that the shard knows about already (e.g because it set // it itself in a chunk migration) // eventually, the only cases to issue a setVersion would be // 1) First chunk of a collection, for version 1|0 // 2) Drop of a collection, for version 0|0 // 3) Load of the shard's chunk state, in a primary-secondary failover assert( it == _versions.end() || version >= it->second ); } _versions[ns] = version; } void ShardingState::appendInfo( BSONObjBuilder& b ){ b.appendBool( "enabled" , _enabled ); if ( ! _enabled ) return; b.append( "configServer" , _configServer ); b.append( "shardName" , _shardName ); b.append( "shardHost" , _shardHost ); { BSONObjBuilder bb( b.subobjStart( "versions" ) ); scoped_lock lk(_mutex); for ( NSVersionMap::iterator i=_versions.begin(); i!=_versions.end(); ++i ){ bb.appendTimestamp( i->first , i->second ); } bb.done(); } } bool ShardingState::needShardChunkManager( const string& ns ) const { if ( ! _enabled ) return false; if ( ! 
ShardedConnectionInfo::get( false ) ) return false; return true; } ShardChunkManagerPtr ShardingState::getShardChunkManager( const string& ns ){ ConfigVersion version; { // check cache scoped_lock lk( _mutex ); NSVersionMap::const_iterator it = _versions.find( ns ); if ( it == _versions.end() ) { return ShardChunkManagerPtr(); } version = it->second; // TODO SERVER-1849 pending drop work // the manager should use the cached version only if the versions match exactly ShardChunkManagerPtr p = _chunks[ns]; if ( p && p->getVersion() >= version ){ // our cached version is good, so just return return p; } } // load the chunk information for this shard from the config database // a reminder: ShardChunkManager may throw on construction const string c = (_configServer == _shardHost) ? "" /* local */ : _configServer; ShardChunkManagerPtr p( new ShardChunkManager( c , ns , _shardName ) ); // TODO SERVER-1849 verify that the manager's version is exactly the one requested // If not, do update _chunks, but fail the request. { scoped_lock lk( _mutex ); _chunks[ns] = p; } return p; } ShardingState shardingState; // -----ShardingState END ---- // -----ShardedConnectionInfo START ---- boost::thread_specific_ptr<ShardedConnectionInfo> ShardedConnectionInfo::_tl; ShardedConnectionInfo::ShardedConnectionInfo(){ _forceVersionOk = false; _id.clear(); } ShardedConnectionInfo* ShardedConnectionInfo::get( bool create ){ ShardedConnectionInfo* info = _tl.get(); if ( ! info && create ){ log(1) << "entering shard mode for connection" << endl; info = new ShardedConnectionInfo(); _tl.reset( info ); } return info; } void ShardedConnectionInfo::reset(){ _tl.reset(); } const ConfigVersion ShardedConnectionInfo::getVersion( const string& ns ) const { NSVersionMap::const_iterator it = _versions.find( ns ); if ( it != _versions.end() ) { return it->second; } else { return 0; } } void ShardedConnectionInfo::setVersion( const string& ns , const ConfigVersion& version ){ _versions[ns] = version; } void ShardedConnectionInfo::setID( const OID& id ){ _id = id; } // -----ShardedConnectionInfo END ---- unsigned long long extractVersion( BSONElement e , string& errmsg ){ if ( e.eoo() ){ errmsg = "no version"; return 0; } if ( e.isNumber() ) return (unsigned long long)e.number(); if ( e.type() == Date || e.type() == Timestamp ) return e._numberLong(); errmsg = "version is not a numeric type"; return 0; } class MongodShardCommand : public Command { public: MongodShardCommand( const char * n ) : Command( n ){ } virtual bool slaveOk() const { return false; } virtual bool adminOnly() const { return true; } }; bool haveLocalShardingInfo( const string& ns ){ if ( ! shardingState.enabled() ) return false; if ( ! 
shardingState.hasVersion( ns ) ) return false; return ShardedConnectionInfo::get(false) > 0; } class UnsetShardingCommand : public MongodShardCommand { public: UnsetShardingCommand() : MongodShardCommand("unsetSharding"){} virtual void help( stringstream& help ) const { help << " example: { unsetSharding : 1 } "; } virtual LockType locktype() const { return NONE; } bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ ShardedConnectionInfo::reset(); return true; } } unsetShardingCommand; class SetShardVersion : public MongodShardCommand { public: SetShardVersion() : MongodShardCommand("setShardVersion"){} virtual void help( stringstream& help ) const { help << " example: { setShardVersion : 'alleyinsider.foo' , version : 1 , configdb : '' } "; } virtual LockType locktype() const { return WRITE; } // TODO: figure out how to make this not need to lock bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ lastError.disableForCommand(); ShardedConnectionInfo* info = ShardedConnectionInfo::get( true ); bool authoritative = cmdObj.getBoolField( "authoritative" ); string configdb = cmdObj["configdb"].valuestrsafe(); { // configdb checking if ( configdb.size() == 0 ){ errmsg = "no configdb"; return false; } if ( shardingState.enabled() ){ if ( configdb != shardingState.getConfigServer() ){ errmsg = "specified a different configdb!"; return false; } } else { if ( ! authoritative ){ result.appendBool( "need_authoritative" , true ); errmsg = "first setShardVersion"; return false; } shardingState.enable( configdb ); configServer.init( configdb ); } } if ( cmdObj["shard"].type() == String ){ shardingState.gotShardName( cmdObj["shard"].String() ); shardingState.gotShardHost( cmdObj["shardHost"].String() ); } { // setting up ids if ( cmdObj["serverID"].type() != jstOID ){ // TODO: fix this //errmsg = "need serverID to be an OID"; //return 0; } else { OID clientId = cmdObj["serverID"].__oid(); if ( ! info->hasID() ){ info->setID( clientId ); } else if ( clientId != info->getID() ){ errmsg = "server id has changed!"; return 0; } } } unsigned long long version = extractVersion( cmdObj["version"] , errmsg ); if ( errmsg.size() ){ return false; } string ns = cmdObj["setShardVersion"].valuestrsafe(); if ( ns.size() == 0 ){ errmsg = "need to speciy fully namespace"; return false; } const ConfigVersion oldVersion = info->getVersion(ns); const ConfigVersion globalVersion = shardingState.getVersion(ns); if ( oldVersion > 0 && globalVersion == 0 ){ // this had been reset info->setVersion( ns , 0 ); } if ( version == 0 && globalVersion == 0 ){ // this connection is cleaning itself info->setVersion( ns , 0 ); return true; } if ( version == 0 && globalVersion > 0 ){ if ( ! 
authoritative ){ result.appendBool( "need_authoritative" , true ); result.appendTimestamp( "globalVersion" , globalVersion ); result.appendTimestamp( "oldVersion" , oldVersion ); errmsg = "dropping needs to be authoritative"; return false; } log() << "wiping data for: " << ns << endl; result.appendTimestamp( "beforeDrop" , globalVersion ); // only setting global version on purpose // need clients to re-find meta-data shardingState.setVersion( ns , 0 ); info->setVersion( ns , 0 ); return true; } if ( version < oldVersion ){ errmsg = "you already have a newer version"; result.appendTimestamp( "oldVersion" , oldVersion ); result.appendTimestamp( "newVersion" , version ); result.appendTimestamp( "globalVersion" , globalVersion ); return false; } if ( version < globalVersion ){ while ( shardingState.inCriticalMigrateSection() ){ dbtemprelease r; sleepmillis(2); log() << "waiting till out of critical section" << endl; } errmsg = "going to older version for global"; result.appendTimestamp( "version" , version ); result.appendTimestamp( "globalVersion" , globalVersion ); return false; } if ( globalVersion == 0 && ! cmdObj.getBoolField( "authoritative" ) ){ // need authoritative for first look result.appendBool( "need_authoritative" , true ); result.append( "ns" , ns ); errmsg = "first time for this ns"; return false; } result.appendTimestamp( "oldVersion" , oldVersion ); result.append( "ok" , 1 ); info->setVersion( ns , version ); shardingState.setVersion( ns , version ); // TODO SERVER-1849 pending drop work // getShardChunkManager is assuming that the setVersion above were valid // ideally, we'd call getShardChunkManager first, verify that 'version' is sound, and then update // connection and global state { dbtemprelease unlock; shardingState.getShardChunkManager( ns ); } return true; } } setShardVersionCmd; class GetShardVersion : public MongodShardCommand { public: GetShardVersion() : MongodShardCommand("getShardVersion"){} virtual void help( stringstream& help ) const { help << " example: { getShardVersion : 'alleyinsider.foo' } "; } virtual LockType locktype() const { return NONE; } bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ string ns = cmdObj["getShardVersion"].valuestrsafe(); if ( ns.size() == 0 ){ errmsg = "need to speciy fully namespace"; return false; } result.append( "configServer" , shardingState.getConfigServer() ); result.appendTimestamp( "global" , shardingState.getVersion(ns) ); ShardedConnectionInfo* info = ShardedConnectionInfo::get( false ); if ( info ) result.appendTimestamp( "mine" , info->getVersion(ns) ); else result.appendTimestamp( "mine" , 0 ); return true; } } getShardVersion; class ShardingStateCmd : public MongodShardCommand { public: ShardingStateCmd() : MongodShardCommand( "shardingState" ){} virtual LockType locktype() const { return WRITE; } // TODO: figure out how to make this not need to lock bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ shardingState.appendInfo( result ); return true; } } shardingStateCmd; /** * @ return true if not in sharded mode or if version for this client is ok */ bool shardVersionOk( const string& ns , bool isWriteOp , string& errmsg ){ if ( ! shardingState.enabled() ) return true; ShardedConnectionInfo* info = ShardedConnectionInfo::get( false ); if ( ! 
info ){ // this means the client has nothing sharded // so this allows direct connections to do whatever they want // which i think is the correct behavior return true; } if ( info->inForceVersionOkMode() ){ return true; } ConfigVersion version; if ( ! shardingState.hasVersion( ns , version ) ){ return true; } ConfigVersion clientVersion = info->getVersion(ns); if ( version == 0 && clientVersion > 0 ){ stringstream ss; ss << "collection was dropped or this shard no longer valied version: " << version << " clientVersion: " << clientVersion; errmsg = ss.str(); return false; } if ( clientVersion >= version ) return true; if ( clientVersion == 0 ){ stringstream ss; ss << "client in sharded mode, but doesn't have version set for this collection: " << ns << " myVersion: " << version; errmsg = ss.str(); return false; } if ( isWriteOp && version.majorVersion() == clientVersion.majorVersion() ){ // this means there was just a split // since on a split w/o a migrate this server is ok // going to accept write return true; } stringstream ss; ss << "your version is too old ns: " + ns << " global: " << version << " client: " << clientVersion; errmsg = ss.str(); return false; } }
bool mergeChunks(OperationContext* txn,
                 const NamespaceString& nss,
                 const BSONObj& minKey,
                 const BSONObj& maxKey,
                 const OID& epoch,
                 string* errMsg) {
    // Get the distributed lock
    string whyMessage = stream() << "merging chunks in " << nss.ns() << " from " << minKey
                                 << " to " << maxKey;
    auto scopedDistLock = grid.catalogManager(txn)->distLock(
        txn, nss.ns(), whyMessage, DistLockManager::kSingleLockAttemptTimeout);

    if (!scopedDistLock.isOK()) {
        *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                           << " to merge chunks in [" << minKey << "," << maxKey << ")"
                           << causedBy(scopedDistLock.getStatus());
        warning() << *errMsg;
        return false;
    }

    ShardingState* shardingState = ShardingState::get(txn);

    //
    // We now have the collection lock, refresh metadata to latest version and sanity check
    //

    ChunkVersion shardVersion;
    Status status = shardingState->refreshMetadataNow(txn, nss.ns(), &shardVersion);

    if (!status.isOK()) {
        *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                << nss.ns() << causedBy(status.reason());
        warning() << *errMsg;
        return false;
    }

    if (epoch.isSet() && shardVersion.epoch() != epoch) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has changed"
                           << " since merge was sent"
                           << "(sent epoch : " << epoch.toString()
                           << ", current epoch : " << shardVersion.epoch().toString() << ")";
        warning() << *errMsg;
        return false;
    }

    shared_ptr<CollectionMetadata> metadata = shardingState->getCollectionMetadata(nss.ns());

    if (!metadata || metadata->getKeyPattern().isEmpty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " is not sharded";
        warning() << *errMsg;
        return false;
    }

    dassert(metadata->getShardVersion().equals(shardVersion));

    if (!metadata->isValidKey(minKey) || !metadata->isValidKey(maxKey)) {
        *errMsg = stream() << "could not merge chunks, the range "
                           << rangeToString(minKey, maxKey) << " is not valid"
                           << " for collection " << nss.ns() << " with key pattern "
                           << metadata->getKeyPattern();
        warning() << *errMsg;
        return false;
    }

    //
    // Get merged chunk information
    //

    ChunkVersion mergeVersion = metadata->getCollVersion();
    mergeVersion.incMinor();

    std::vector<ChunkType> chunksToMerge;

    ChunkType itChunk;
    itChunk.setMin(minKey);
    itChunk.setMax(minKey);
    itChunk.setNS(nss.ns());
    itChunk.setShard(shardingState->getShardName());

    while (itChunk.getMax().woCompare(maxKey) < 0 &&
           metadata->getNextChunk(itChunk.getMax(), &itChunk)) {
        chunksToMerge.push_back(itChunk);
    }

    if (chunksToMerge.empty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " and ending at " << maxKey
                           << " does not belong to shard " << shardingState->getShardName();
        warning() << *errMsg;
        return false;
    }

    //
    // Validate the range starts and ends at chunks and has no holes, error if not valid
    //

    BSONObj firstDocMin = chunksToMerge.front().getMin();
    BSONObj firstDocMax = chunksToMerge.front().getMax();
    // minKey is inclusive
    bool minKeyInRange = rangeContains(firstDocMin, firstDocMax, minKey);

    if (!minKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " does not belong to shard "
                           << shardingState->getShardName();
        warning() << *errMsg;
        return false;
    }

    BSONObj lastDocMin = chunksToMerge.back().getMin();
    BSONObj lastDocMax = chunksToMerge.back().getMax();
    // maxKey is exclusive
    bool maxKeyInRange = lastDocMin.woCompare(maxKey) < 0 && lastDocMax.woCompare(maxKey) >= 0;

    if (!maxKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range ending at " << maxKey << " does not belong to shard "
                           << shardingState->getShardName();
        warning() << *errMsg;
        return false;
    }

    bool validRangeStartKey = firstDocMin.woCompare(minKey) == 0;
    bool validRangeEndKey = lastDocMax.woCompare(maxKey) == 0;

    if (!validRangeStartKey || !validRangeEndKey) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " does not contain a chunk "
                           << (!validRangeStartKey ? "starting at " + minKey.toString() : "")
                           << (!validRangeStartKey && !validRangeEndKey ? " or " : "")
                           << (!validRangeEndKey ? "ending at " + maxKey.toString() : "");
        warning() << *errMsg;
        return false;
    }

    if (chunksToMerge.size() == 1) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " already contains chunk for " << rangeToString(minKey, maxKey);
        warning() << *errMsg;
        return false;
    }

    // Look for hole in range
    for (size_t i = 1; i < chunksToMerge.size(); ++i) {
        if (chunksToMerge[i - 1].getMax().woCompare(chunksToMerge[i].getMin()) != 0) {
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has a hole in the range " << rangeToString(minKey, maxKey)
                               << " at " << rangeToString(chunksToMerge[i - 1].getMax(),
                                                          chunksToMerge[i].getMin());
            warning() << *errMsg;
            return false;
        }
    }

    //
    // Run apply ops command
    //

    Status applyOpsStatus = runApplyOpsCmd(txn, chunksToMerge, shardVersion, mergeVersion);
    if (!applyOpsStatus.isOK()) {
        warning() << applyOpsStatus;
        return false;
    }

    //
    // Install merged chunk metadata
    //

    {
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock writeLk(txn->lockState(), nss.db(), MODE_IX);
        Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_X);
        shardingState->mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
    }

    //
    // Log change
    //

    BSONObj mergeLogEntry = buildMergeLogEntry(chunksToMerge, shardVersion, mergeVersion);
    grid.catalogManager(txn)->logChange(txn, "merge", nss.ns(), mergeLogEntry);

    return true;
}
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
    // Steps
    // 1. check basic config
    // 2. extract params from command
    // 3. fast check
    // 4. slow check (LOCKS)

    // step 1

    lastError.disableForCommand();
    ShardedConnectionInfo* info = ShardedConnectionInfo::get( true );

    // make sure we have the mongos id for writebacks
    if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) )
        return false;

    bool authoritative = cmdObj.getBoolField( "authoritative" );

    // check config server is ok or enable sharding
    if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) )
        return false;

    // check shard name/hosts are correct
    if ( cmdObj["shard"].type() == String ) {
        shardingState.gotShardName( cmdObj["shard"].String() );
        shardingState.gotShardHost( cmdObj["shardHost"].String() );
    }

    // Handle initial shard connection
    if ( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ) {
        result.append( "initialized", true );
        return true;
    }

    // we can run on a slave up to here
    if ( ! isMaster( "admin" ) ) {
        result.append( "errmsg" , "not master" );
        result.append( "note" , "from post init in setShardVersion" );
        return false;
    }

    // step 2

    string ns = cmdObj["setShardVersion"].valuestrsafe();
    if ( ns.size() == 0 ) {
        errmsg = "need to specify namespace";
        return false;
    }

    const ConfigVersion version = ConfigVersion( extractVersion( cmdObj["version"] , errmsg ), OID() );
    if ( errmsg.size() )
        return false;

    // step 3

    const ConfigVersion oldVersion = info->getVersion(ns);
    const ConfigVersion globalVersion = shardingState.getVersion(ns);

    oldVersion.addToBSON( result, "oldVersion" );

    if ( globalVersion.isSet() && version.isSet() ) {
        // this means there is no reset going on on either side
        // so it's safe to make some assumptions

        if ( version.isEquivalentTo( globalVersion ) ) {
            // mongos and mongod agree!
            if ( ! oldVersion.isEquivalentTo( version ) ) {
                if ( oldVersion < globalVersion ) {
                    info->setVersion( ns , version );
                }
                else if ( authoritative ) {
                    // this means there was a drop and our version is reset
                    info->setVersion( ns , version );
                }
                else {
                    result.append( "ns" , ns );
                    result.appendBool( "need_authoritative" , true );
                    errmsg = "verifying drop on '" + ns + "'";
                    return false;
                }
            }
            return true;
        }
    }

    // step 4

    // this is because of a weird segfault I saw and I can't see why this should ever be set
    massert( 13647 ,
             str::stream() << "context should be empty here, is: " << cc().getContext()->ns() ,
             cc().getContext() == 0 );

    Lock::GlobalWrite setShardVersionLock; // TODO: can we get rid of this??

    if ( oldVersion.isSet() && ! globalVersion.isSet() ) {
        // this had been reset
        info->setVersion( ns , ShardChunkVersion( 0, OID() ) );
    }

    if ( ! version.isSet() && ! globalVersion.isSet() ) {
        // this connection is cleaning itself
        info->setVersion( ns , ShardChunkVersion( 0, OID() ) );
        return true;
    }

    if ( ! version.isSet() && globalVersion.isSet() ) {
        if ( ! authoritative ) {
            result.appendBool( "need_authoritative" , true );
            result.append( "ns" , ns );
            globalVersion.addToBSON( result, "globalVersion" );
            errmsg = "dropping needs to be authoritative";
            return false;
        }
        log() << "wiping data for: " << ns << endl;
        globalVersion.addToBSON( result, "beforeDrop" );
        // only setting global version on purpose
        // need clients to re-find meta-data
        shardingState.resetVersion( ns );
        info->setVersion( ns , ShardChunkVersion( 0, OID() ) );
        return true;
    }

    if ( version < oldVersion ) {
        errmsg = "this connection already had a newer version of collection '" + ns + "'";
        result.append( "ns" , ns );
        version.addToBSON( result, "newVersion" );
        globalVersion.addToBSON( result, "globalVersion" );
        return false;
    }

    if ( version < globalVersion ) {
        while ( shardingState.inCriticalMigrateSection() ) {
            dbtemprelease r;
            sleepmillis(2);
            OCCASIONALLY log() << "waiting till out of critical section" << endl;
        }
        errmsg = "shard global version for collection is higher than trying to set to '" + ns + "'";
        result.append( "ns" , ns );
        version.addToBSON( result, "version" );
        globalVersion.addToBSON( result, "globalVersion" );
        result.appendBool( "reloadConfig" , true );
        return false;
    }

    if ( ! globalVersion.isSet() && ! authoritative ) {
        // Needed b/c when the last chunk is moved off a shard, the version gets reset to zero,
        // which should require a reload.
        // TODO: Maybe a more elegant way of doing this
        while ( shardingState.inCriticalMigrateSection() ) {
            dbtemprelease r;
            sleepmillis(2);
            OCCASIONALLY log() << "waiting till out of critical section for version reset" << endl;
        }

        // need authoritative for first look
        result.append( "ns" , ns );
        result.appendBool( "need_authoritative" , true );
        errmsg = "first time for collection '" + ns + "'";
        return false;
    }

    Timer relockTime;
    {
        dbtemprelease unlock;

        ShardChunkVersion currVersion = version;
        if ( ! shardingState.trySetVersion( ns , currVersion ) ) {
            errmsg = str::stream() << "client version differs from config's for collection '" << ns << "'";
            result.append( "ns" , ns );
            version.addToBSON( result, "version" );
            globalVersion.addToBSON( result, "globalVersion" );
            return false;
        }
    }
    if ( relockTime.millis() >= ( cmdLine.slowMS - 10 ) ) {
        log() << "setShardVersion - relocking slow: " << relockTime.millis() << endl;
    }

    info->setVersion( ns , version );
    return true;
}
namespace mongo { // -----ShardingState START ---- ShardingState::ShardingState() : _enabled(false) , _mutex( "ShardingState" ), _configServerTickets( 3 /* max number of concurrent config server refresh threads */ ) { } void ShardingState::enable( const string& server ) { _enabled = true; verify( server.size() ); if ( _configServer.size() == 0 ) _configServer = server; else { verify( server == _configServer ); } } void ShardingState::gotShardName( const string& name ) { scoped_lock lk(_mutex); if ( _shardName.size() == 0 ) { // TODO SERVER-2299 verify the name is sound w.r.t IPs _shardName = name; return; } if ( _shardName == name ) return; stringstream ss; ss << "gotShardName different than what i had before " << " before [" << _shardName << "] " << " got [" << name << "] " ; msgasserted( 13298 , ss.str() ); } void ShardingState::gotShardHost( string host ) { scoped_lock lk(_mutex); size_t slash = host.find( '/' ); if ( slash != string::npos ) host = host.substr( 0 , slash ); if ( _shardHost.size() == 0 ) { _shardHost = host; return; } if ( _shardHost == host ) return; stringstream ss; ss << "gotShardHost different than what i had before " << " before [" << _shardHost << "] " << " got [" << host << "] " ; msgasserted( 13299 , ss.str() ); } void ShardingState::resetShardingState() { scoped_lock lk(_mutex); _enabled = false; _configServer.clear(); _shardName.clear(); _shardHost.clear(); _chunks.clear(); } // TODO we shouldn't need three ways for checking the version. Fix this. bool ShardingState::hasVersion( const string& ns ) { scoped_lock lk(_mutex); ChunkManagersMap::const_iterator it = _chunks.find(ns); return it != _chunks.end(); } bool ShardingState::hasVersion( const string& ns , ConfigVersion& version ) { scoped_lock lk(_mutex); ChunkManagersMap::const_iterator it = _chunks.find(ns); if ( it == _chunks.end() ) return false; ShardChunkManagerPtr p = it->second; version = p->getVersion(); return true; } const ConfigVersion ShardingState::getVersion( const string& ns ) const { scoped_lock lk(_mutex); ChunkManagersMap::const_iterator it = _chunks.find( ns ); if ( it != _chunks.end() ) { ShardChunkManagerPtr p = it->second; return p->getVersion(); } else { return ConfigVersion( 0, OID() ); } } void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ) { scoped_lock lk( _mutex ); ChunkManagersMap::const_iterator it = _chunks.find( ns ); verify( it != _chunks.end() ) ; ShardChunkManagerPtr p = it->second; // empty shards should have version 0 version = ( p->getNumChunks() > 1 ) ? 
version : ShardChunkVersion( 0 , OID() ); ShardChunkManagerPtr cloned( p->cloneMinus( min , max , version ) ); _chunks[ns] = cloned; } void ShardingState::undoDonateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ) { scoped_lock lk( _mutex ); ChunkManagersMap::const_iterator it = _chunks.find( ns ); verify( it != _chunks.end() ) ; ShardChunkManagerPtr p( it->second->clonePlus( min , max , version ) ); _chunks[ns] = p; } void ShardingState::splitChunk( const string& ns , const BSONObj& min , const BSONObj& max , const vector<BSONObj>& splitKeys , ShardChunkVersion version ) { scoped_lock lk( _mutex ); ChunkManagersMap::const_iterator it = _chunks.find( ns ); verify( it != _chunks.end() ) ; ShardChunkManagerPtr p( it->second->cloneSplit( min , max , splitKeys , version ) ); _chunks[ns] = p; } void ShardingState::resetVersion( const string& ns ) { scoped_lock lk( _mutex ); _chunks.erase( ns ); } bool ShardingState::trySetVersion( const string& ns , ConfigVersion& version /* IN-OUT */ ) { // Currently this function is called after a getVersion(), which is the first "check", and the assumption here // is that we don't do anything nearly as long as a remote query in a thread between then and now. // Otherwise it may be worth adding an additional check without the _configServerMutex below, since then it // would be likely that the version may have changed in the meantime without waiting for or fetching config results. // TODO: Mutex-per-namespace? LOG( 2 ) << "trying to set shard version of " << version.toString() << " for '" << ns << "'" << endl; _configServerTickets.waitForTicket(); TicketHolderReleaser needTicketFrom( &_configServerTickets ); // fast path - double-check if requested version is at the same version as this chunk manager before verifying // against config server // // This path will short-circuit the version set if another thread already managed to update the version in the // meantime. First check is from getVersion(). // // cases: // + this shard updated the version for a migrate's commit (FROM side) // a client reloaded chunk state from config and picked the newest version // + two clients reloaded // one triggered the 'slow path' (below) // when the second's request gets here, the version is already current ConfigVersion storedVersion; ShardChunkManagerPtr currManager; { scoped_lock lk( _mutex ); ChunkManagersMap::const_iterator it = _chunks.find( ns ); if ( it != _chunks.end() ) currManager = it->second; if ( it != _chunks.end() && ( storedVersion = it->second->getVersion() ).isEquivalentTo( version ) ) return true; } LOG( 2 ) << "verifying cached version " << storedVersion.toString() << " and new version " << version.toString() << " for '" << ns << "'" << endl; // slow path - requested version is different than the current chunk manager's, if one exists, so must check for // newest version in the config server // // cases: // + a chunk moved TO here // (we don't bump up the version on the TO side but the commit to config does use higher version) // a client reloads from config an issued the request // + there was a take over from a secondary // the secondary had no state (managers) at all, so every client request will fall here // + a stale client request a version that's not current anymore // Can't lock default mutex while creating ShardChunkManager, b/c may have to create a new connection to myself const string c = (_configServer == _shardHost) ? 
"" /* local */ : _configServer; ShardChunkManagerPtr p( new ShardChunkManager( c , ns , _shardName, currManager ) ); { scoped_lock lk( _mutex ); // since we loaded the chunk manager unlocked, other thread may have done the same // make sure we keep the freshest config info only ChunkManagersMap::const_iterator it = _chunks.find( ns ); if ( it == _chunks.end() || p->getVersion() >= it->second->getVersion() ) { _chunks[ns] = p; } ShardChunkVersion oldVersion = version; version = p->getVersion(); return oldVersion.isEquivalentTo( version ); } } void ShardingState::appendInfo( BSONObjBuilder& b ) { b.appendBool( "enabled" , _enabled ); if ( ! _enabled ) return; b.append( "configServer" , _configServer ); b.append( "shardName" , _shardName ); b.append( "shardHost" , _shardHost ); { BSONObjBuilder bb( b.subobjStart( "versions" ) ); scoped_lock lk(_mutex); for ( ChunkManagersMap::iterator it = _chunks.begin(); it != _chunks.end(); ++it ) { ShardChunkManagerPtr p = it->second; bb.appendTimestamp( it->first , p->getVersion().toLong() ); } bb.done(); } } bool ShardingState::needShardChunkManager( const string& ns ) const { if ( ! _enabled ) return false; if ( ! ShardedConnectionInfo::get( false ) ) return false; return true; } ShardChunkManagerPtr ShardingState::getShardChunkManager( const string& ns ) { scoped_lock lk( _mutex ); ChunkManagersMap::const_iterator it = _chunks.find( ns ); if ( it == _chunks.end() ) { return ShardChunkManagerPtr(); } else { return it->second; } } ShardingState shardingState; // -----ShardingState END ---- // -----ShardedConnectionInfo START ---- boost::thread_specific_ptr<ShardedConnectionInfo> ShardedConnectionInfo::_tl; ShardedConnectionInfo::ShardedConnectionInfo() { _forceVersionOk = false; _id.clear(); } ShardedConnectionInfo* ShardedConnectionInfo::get( bool create ) { ShardedConnectionInfo* info = _tl.get(); if ( ! info && create ) { LOG(1) << "entering shard mode for connection" << endl; info = new ShardedConnectionInfo(); _tl.reset( info ); } return info; } void ShardedConnectionInfo::reset() { _tl.reset(); } const ConfigVersion ShardedConnectionInfo::getVersion( const string& ns ) const { NSVersionMap::const_iterator it = _versions.find( ns ); if ( it != _versions.end() ) { return it->second; } else { return ConfigVersion( 0, OID() ); } } void ShardedConnectionInfo::setVersion( const string& ns , const ConfigVersion& version ) { _versions[ns] = version; } void ShardedConnectionInfo::addHook() { static bool done = false; if (!done) { LOG(1) << "adding sharding hook" << endl; pool.addHook(new ShardingConnectionHook(false)); shardConnectionPool.addHook(new ShardingConnectionHook(true)); done = true; } } void ShardedConnectionInfo::setID( const OID& id ) { _id = id; } // -----ShardedConnectionInfo END ---- unsigned long long extractVersion( BSONElement e , string& errmsg ) { if ( e.eoo() ) { errmsg = "no version"; return 0; } if ( e.isNumber() ) return (unsigned long long)e.number(); if ( e.type() == Date || e.type() == Timestamp ) return e._numberLong(); errmsg = "version is not a numeric type"; return 0; } class MongodShardCommand : public Command { public: MongodShardCommand( const char * n ) : Command( n ) { } virtual bool slaveOk() const { return false; } virtual bool adminOnly() const { return true; } }; bool haveLocalShardingInfo( const string& ns ) { if ( ! shardingState.enabled() ) return false; if ( ! 
shardingState.hasVersion( ns ) ) return false; return ShardedConnectionInfo::get(false) > 0; } class UnsetShardingCommand : public MongodShardCommand { public: UnsetShardingCommand() : MongodShardCommand("unsetSharding") {} virtual void help( stringstream& help ) const { help << " example: { unsetSharding : 1 } "; } virtual LockType locktype() const { return NONE; } virtual bool slaveOk() const { return true; } bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { ShardedConnectionInfo::reset(); return true; } } unsetShardingCommand; class SetShardVersion : public MongodShardCommand { public: SetShardVersion() : MongodShardCommand("setShardVersion") {} virtual void help( stringstream& help ) const { help << " example: { setShardVersion : 'alleyinsider.foo' , version : 1 , configdb : '' } "; } virtual bool slaveOk() const { return true; } virtual LockType locktype() const { return NONE; } bool checkConfigOrInit( const string& configdb , bool authoritative , string& errmsg , BSONObjBuilder& result , bool locked=false ) const { if ( configdb.size() == 0 ) { errmsg = "no configdb"; return false; } if ( shardingState.enabled() ) { if ( configdb == shardingState.getConfigServer() ) return true; result.append( "configdb" , BSON( "stored" << shardingState.getConfigServer() << "given" << configdb ) ); errmsg = str::stream() << "mongos specified a different config database string : " << "stored : " << shardingState.getConfigServer() << " vs given : " << configdb; return false; } if ( ! authoritative ) { result.appendBool( "need_authoritative" , true ); errmsg = "first setShardVersion"; return false; } if ( locked ) { ShardedConnectionInfo::addHook(); shardingState.enable( configdb ); configServer.init( configdb ); return true; } Lock::GlobalWrite lk; return checkConfigOrInit( configdb , authoritative , errmsg , result , true ); } bool checkMongosID( ShardedConnectionInfo* info, const BSONElement& id, string& errmsg ) { if ( id.type() != jstOID ) { if ( ! info->hasID() ) { warning() << "bad serverID set in setShardVersion and none in info: " << id << endl; } // TODO: fix this //errmsg = "need serverID to be an OID"; //return 0; return true; } OID clientId = id.__oid(); if ( ! info->hasID() ) { info->setID( clientId ); return true; } if ( clientId != info->getID() ) { errmsg = "server id has changed!"; return false; } return true; } bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { // Steps // 1. check basic config // 2. extract params from command // 3. fast check // 4. slow check (LOCKS) // step 1 lastError.disableForCommand(); ShardedConnectionInfo* info = ShardedConnectionInfo::get( true ); // make sure we have the mongos id for writebacks if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) ) return false; bool authoritative = cmdObj.getBoolField( "authoritative" ); // check config server is ok or enable sharding if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) ) return false; // check shard name/hosts are correct if ( cmdObj["shard"].type() == String ) { shardingState.gotShardName( cmdObj["shard"].String() ); shardingState.gotShardHost( cmdObj["shardHost"].String() ); } // Handle initial shard connection if( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ) { result.append( "initialized", true ); return true; } // we can run on a slave up to here if ( ! 
isMaster( "admin" ) ) { result.append( "errmsg" , "not master" ); result.append( "note" , "from post init in setShardVersion" ); return false; } // step 2 string ns = cmdObj["setShardVersion"].valuestrsafe(); if ( ns.size() == 0 ) { errmsg = "need to specify namespace"; return false; } const ConfigVersion version = ConfigVersion( extractVersion( cmdObj["version"] , errmsg ), OID() ); if ( errmsg.size() ) return false; // step 3 const ConfigVersion oldVersion = info->getVersion(ns); const ConfigVersion globalVersion = shardingState.getVersion(ns); oldVersion.addToBSON( result, "oldVersion" ); if ( globalVersion.isSet() && version.isSet() ) { // this means there is no reset going on an either side // so its safe to make some assumptions if ( version.isEquivalentTo( globalVersion ) ) { // mongos and mongod agree! if ( ! oldVersion.isEquivalentTo( version ) ) { if ( oldVersion < globalVersion ) { info->setVersion( ns , version ); } else if ( authoritative ) { // this means there was a drop and our version is reset info->setVersion( ns , version ); } else { result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "verifying drop on '" + ns + "'"; return false; } } return true; } } // step 4 // this is because of a weird segfault I saw and I can't see why this should ever be set massert( 13647 , str::stream() << "context should be empty here, is: " << cc().getContext()->ns() , cc().getContext() == 0 ); Lock::GlobalWrite setShardVersionLock; // TODO: can we get rid of this?? if ( oldVersion.isSet() && ! globalVersion.isSet() ) { // this had been reset info->setVersion( ns , ShardChunkVersion( 0, OID() ) ); } if ( ! version.isSet() && ! globalVersion.isSet() ) { // this connection is cleaning itself info->setVersion( ns , ShardChunkVersion( 0, OID() ) ); return true; } if ( ! version.isSet() && globalVersion.isSet() ) { if ( ! authoritative ) { result.appendBool( "need_authoritative" , true ); result.append( "ns" , ns ); globalVersion.addToBSON( result, "globalVersion" ); errmsg = "dropping needs to be authoritative"; return false; } log() << "wiping data for: " << ns << endl; globalVersion.addToBSON( result, "beforeDrop" ); // only setting global version on purpose // need clients to re-find meta-data shardingState.resetVersion( ns ); info->setVersion( ns , ShardChunkVersion( 0, OID() ) ); return true; } if ( version < oldVersion ) { errmsg = "this connection already had a newer version of collection '" + ns + "'"; result.append( "ns" , ns ); version.addToBSON( result, "newVersion" ); globalVersion.addToBSON( result, "globalVersion" ); return false; } if ( version < globalVersion ) { while ( shardingState.inCriticalMigrateSection() ) { dbtemprelease r; sleepmillis(2); OCCASIONALLY log() << "waiting till out of critical section" << endl; } errmsg = "shard global version for collection is higher than trying to set to '" + ns + "'"; result.append( "ns" , ns ); version.addToBSON( result, "version" ); globalVersion.addToBSON( result, "globalVersion" ); result.appendBool( "reloadConfig" , true ); return false; } if ( ! globalVersion.isSet() && ! authoritative ) { // Needed b/c when the last chunk is moved off a shard, the version gets reset to zero, which // should require a reload. 
// TODO: Maybe a more elegant way of doing this while ( shardingState.inCriticalMigrateSection() ) { dbtemprelease r; sleepmillis(2); OCCASIONALLY log() << "waiting till out of critical section for version reset" << endl; } // need authoritative for first look result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "first time for collection '" + ns + "'"; return false; } Timer relockTime; { dbtemprelease unlock; ShardChunkVersion currVersion = version; if ( ! shardingState.trySetVersion( ns , currVersion ) ) { errmsg = str::stream() << "client version differs from config's for collection '" << ns << "'"; result.append( "ns" , ns ); version.addToBSON( result, "version" ); globalVersion.addToBSON( result, "globalVersion" ); return false; } } if ( relockTime.millis() >= ( cmdLine.slowMS - 10 ) ) { log() << "setShardVersion - relocking slow: " << relockTime.millis() << endl; } info->setVersion( ns , version ); return true; } } setShardVersionCmd; class GetShardVersion : public MongodShardCommand { public: GetShardVersion() : MongodShardCommand("getShardVersion") {} virtual void help( stringstream& help ) const { help << " example: { getShardVersion : 'alleyinsider.foo' } "; } virtual LockType locktype() const { return NONE; } bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { string ns = cmdObj["getShardVersion"].valuestrsafe(); if ( ns.size() == 0 ) { errmsg = "need to specify full namespace"; return false; } result.append( "configServer" , shardingState.getConfigServer() ); result.appendTimestamp( "global" , shardingState.getVersion(ns).toLong() ); ShardedConnectionInfo* info = ShardedConnectionInfo::get( false ); result.appendBool( "inShardedMode" , info != 0 ); if ( info ) result.appendTimestamp( "mine" , info->getVersion(ns).toLong() ); else result.appendTimestamp( "mine" , 0 ); return true; } } getShardVersion; class ShardingStateCmd : public MongodShardCommand { public: ShardingStateCmd() : MongodShardCommand( "shardingState" ) {} virtual LockType locktype() const { return WRITE; // TODO: figure out how to make this not need to lock } bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { shardingState.appendInfo( result ); return true; } } shardingStateCmd; /** * @ return true if not in sharded mode or if version for this client is ok */ bool shardVersionOk( const string& ns , string& errmsg, ConfigVersion& received, ConfigVersion& wanted ) { if ( ! shardingState.enabled() ) return true; if ( ! isMasterNs( ns.c_str() ) ) { // right now connections to secondaries aren't versioned at all return true; } ShardedConnectionInfo* info = ShardedConnectionInfo::get( false ); if ( ! info ) { // this means the client has nothing sharded // so this allows direct connections to do whatever they want // which i think is the correct behavior return true; } if ( info->inForceVersionOkMode() ) { return true; } // TODO // all collections at some point, be sharded or not, will have a version (and a ShardChunkManager) // for now, we remove the sharding state of dropped collection // so delayed request may come in. This has to be fixed. ConfigVersion clientVersion = info->getVersion(ns); ConfigVersion version; if ( ! shardingState.hasVersion( ns , version ) && ! clientVersion.isSet() ) { return true; } // The versions we're going to compare, saved for future use received = clientVersion; wanted = version; if ( ! 
version.isSet() && clientVersion.isSet() ) { stringstream ss; ss << "collection was dropped or this shard no longer has a valid version"; errmsg = ss.str(); return false; } if ( clientVersion >= version ) return true; if ( ! clientVersion.isSet() ) { stringstream ss; ss << "client in sharded mode, but doesn't have version set for this collection"; errmsg = ss.str(); return false; } if ( version.majorVersion() == clientVersion.majorVersion() ) { // this means there was just a split // since on a split w/o a migrate this server is ok // going to accept return true; } stringstream ss; ss << "your version is too old"; errmsg = ss.str(); return false; } void ShardingConnectionHook::onHandedOut( DBClientBase * conn ) { // no-op for mongod } }
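The acceptance rule in shardVersionOk above leans on the fact that a migration bumps the major version while a split only bumps the minor version, so a client that is behind by a split alone is still allowed through. The standalone toy below restates that decision order with plain integers; it is not mongo's ConfigVersion/ChunkVersion, just a sketch of the comparison.

// Minimal standalone model of the acceptance rule used by shardVersionOk above.
// ToyVersion is illustrative only and is not a mongo type.
#include <cstdint>

struct ToyVersion {
    uint32_t major_;
    uint32_t minor_;
    bool isSet() const { return major_ != 0 || minor_ != 0; }
    uint64_t combined() const { return ( uint64_t( major_ ) << 32 ) | minor_; }
};

// Mirrors the decision order above: unsharded on both sides is ok, a dropped
// collection is rejected, a client at or ahead of the shard is ok, a client that
// never set a version is rejected, and a client behind only by a split (same
// major version) is accepted.
inline bool toyVersionOk( ToyVersion client, ToyVersion shard ) {
    if ( !shard.isSet() && !client.isSet() ) return true;
    if ( !shard.isSet() && client.isSet() ) return false;      // dropped / shard no longer valid
    if ( client.combined() >= shard.combined() ) return true;  // client is as new or newer
    if ( !client.isSet() ) return false;                       // client never set a version
    return client.major_ == shard.major_;                      // behind only by a split
}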
bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { // Steps // 1. check basic config // 2. extract params from command // 3. fast check // 4. slow check (LOCKS) // step 1 lastError.disableForCommand(); ShardedConnectionInfo* info = ShardedConnectionInfo::get( true ); bool authoritative = cmdObj.getBoolField( "authoritative" ); // check config server is ok or enable sharding if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) ) return false; // check shard name/hosts are correct if ( cmdObj["shard"].type() == String ) { shardingState.gotShardName( cmdObj["shard"].String() ); shardingState.gotShardHost( cmdObj["shardHost"].String() ); } // make sure we have the mongos id for writebacks if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) ) return false; // step 2 string ns = cmdObj["setShardVersion"].valuestrsafe(); if ( ns.size() == 0 ) { errmsg = "need to specify namespace"; return false; } const ConfigVersion version = extractVersion( cmdObj["version"] , errmsg ); if ( errmsg.size() ) return false; // step 3 const ConfigVersion oldVersion = info->getVersion(ns); const ConfigVersion globalVersion = shardingState.getVersion(ns); result.appendTimestamp( "oldVersion" , oldVersion ); if ( globalVersion > 0 && version > 0 ) { // this means there is no reset in progress on either side // so it's safe to make some assumptions if ( version == globalVersion ) { // mongos and mongod agree! if ( oldVersion != version ) { assert( oldVersion < globalVersion ); info->setVersion( ns , version ); } return true; } } // step 4 // this is because of a weird segfault I saw and I can't see why this should ever be set massert( 13647 , str::stream() << "context should be empty here, is: " << cc().getContext()->ns() , cc().getContext() == 0 ); dblock setShardVersionLock; // TODO: can we get rid of this?? if ( oldVersion > 0 && globalVersion == 0 ) { // this had been reset info->setVersion( ns , 0 ); } if ( version == 0 && globalVersion == 0 ) { // this connection is cleaning itself info->setVersion( ns , 0 ); return true; } if ( version == 0 && globalVersion > 0 ) { if ( ! authoritative ) { result.appendBool( "need_authoritative" , true ); result.append( "ns" , ns ); result.appendTimestamp( "globalVersion" , globalVersion ); errmsg = "dropping needs to be authoritative"; return false; } log() << "wiping data for: " << ns << endl; result.appendTimestamp( "beforeDrop" , globalVersion ); // only setting global version on purpose // need clients to re-find meta-data shardingState.resetVersion( ns ); info->setVersion( ns , 0 ); return true; } if ( version < oldVersion ) { errmsg = "this connection already had a newer version of collection '" + ns + "'"; result.append( "ns" , ns ); result.appendTimestamp( "newVersion" , version ); result.appendTimestamp( "globalVersion" , globalVersion ); return false; } if ( version < globalVersion ) { while ( shardingState.inCriticalMigrateSection() ) { dbtemprelease r; sleepmillis(2); OCCASIONALLY log() << "waiting till out of critical section" << endl; } errmsg = "shard global version for collection is higher than trying to set to '" + ns + "'"; result.append( "ns" , ns ); result.appendTimestamp( "version" , version ); result.appendTimestamp( "globalVersion" , globalVersion ); result.appendBool( "reloadConfig" , true ); return false; } if ( globalVersion == 0 && !
authoritative ) { // need authoritative for first look result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "first time for collection '" + ns + "'"; return false; } Timer relockTime; { dbtemprelease unlock; ShardChunkVersion currVersion = version; if ( ! shardingState.trySetVersion( ns , currVersion ) ) { errmsg = str::stream() << "client version differs from config's for collection '" << ns << "'"; result.append( "ns" , ns ); result.appendTimestamp( "version" , version ); result.appendTimestamp( "globalVersion" , currVersion ); return false; } } if ( relockTime.millis() >= ( cmdLine.slowMS - 10 ) ) { log() << "setShardVersion - relocking slow: " << relockTime.millis() << endl; } info->setVersion( ns , version ); return true; }
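The trySetVersion path reached from the run() above throttles config-server reloads with a ticket holder (three tickets, as set in the ShardingState constructor), so that at most a handful of threads hit the config servers at once. Below is a minimal standalone sketch of that throttling idea using the C++ standard library; TicketGate and TicketGateReleaser are illustrative stand-ins, not mongo's TicketHolder / TicketHolderReleaser.

// Standalone sketch of the "at most N concurrent config refreshes" idea.
#include <condition_variable>
#include <mutex>

class TicketGate {
public:
    explicit TicketGate( int tickets ) : _available( tickets ) {}

    // Blocks until one of the N tickets is free, then takes it.
    void waitForTicket() {
        std::unique_lock<std::mutex> lk( _m );
        _cv.wait( lk, [this] { return _available > 0; } );
        --_available;
    }

    // Returns a ticket and wakes one waiter.
    void releaseTicket() {
        std::lock_guard<std::mutex> lk( _m );
        ++_available;
        _cv.notify_one();
    }

private:
    std::mutex _m;
    std::condition_variable _cv;
    int _available;
};

// RAII releaser mirroring the role of TicketHolderReleaser: the ticket is given
// back even if the refresh code throws.
class TicketGateReleaser {
public:
    explicit TicketGateReleaser( TicketGate* gate ) : _gate( gate ) {}
    ~TicketGateReleaser() { _gate->releaseTicket(); }
private:
    TicketGate* _gate;
};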
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { // Steps // 1. check basic config // 2. extract params from command // 3. fast check // 4. slow check (LOCKS) // step 1 lastError.disableForCommand(); ShardedConnectionInfo* info = ShardedConnectionInfo::get( true ); // make sure we have the mongos id for writebacks if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) ) return false; bool authoritative = cmdObj.getBoolField( "authoritative" ); // check config server is ok or enable sharding if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) ) return false; // check shard name/hosts are correct if ( cmdObj["shard"].type() == String ) { shardingState.gotShardName( cmdObj["shard"].String() ); } // Handle initial shard connection if( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ){ result.append( "initialized", true ); // Send back wire version to let mongos know what protocol we can speak result.append( "minWireVersion", minWireVersion ); result.append( "maxWireVersion", maxWireVersion ); return true; } // we can run on a slave up to here if ( ! isMaster( "admin" ) ) { result.append( "errmsg" , "not master" ); result.append( "note" , "from post init in setShardVersion" ); return false; } // step 2 string ns = cmdObj["setShardVersion"].valuestrsafe(); if ( ns.size() == 0 ) { errmsg = "need to specify namespace"; return false; } if( ! ChunkVersion::canParseBSON( cmdObj, "version" ) ){ errmsg = "need to specify version"; return false; } const ChunkVersion version = ChunkVersion::fromBSON( cmdObj, "version" ); // step 3 const ChunkVersion oldVersion = info->getVersion(ns); const ChunkVersion globalVersion = shardingState.getVersion(ns); oldVersion.addToBSON( result, "oldVersion" ); if ( globalVersion.isSet() && version.isSet() ) { // this means there is no reset going on an either side // so its safe to make some assumptions if ( version.isWriteCompatibleWith( globalVersion ) ) { // mongos and mongod agree! if ( ! oldVersion.isWriteCompatibleWith( version ) ) { if ( oldVersion < globalVersion && oldVersion.hasCompatibleEpoch(globalVersion) ) { info->setVersion( ns , version ); } else if ( authoritative ) { // this means there was a drop and our version is reset info->setVersion( ns , version ); } else { result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "verifying drop on '" + ns + "'"; return false; } } return true; } } // step 4 // this is because of a weird segfault I saw and I can't see why this should ever be set massert( 13647 , str::stream() << "context should be empty here, is: " << cc().getContext()->ns() , cc().getContext() == 0 ); if ( oldVersion.isSet() && ! globalVersion.isSet() ) { // this had been reset info->setVersion( ns , ChunkVersion( 0, OID() ) ); } if ( ! version.isSet() && ! globalVersion.isSet() ) { // this connection is cleaning itself info->setVersion( ns , ChunkVersion( 0, OID() ) ); return true; } // Cases below all either return OR fall-through to remote metadata reload. 
if ( version.isSet() || !globalVersion.isSet() ) { // Not Dropping // TODO: Refactor all of this if ( version < oldVersion && version.hasCompatibleEpoch( oldVersion ) ) { errmsg = "this connection already had a newer version of collection '" + ns + "'"; result.append( "ns" , ns ); version.addToBSON( result, "newVersion" ); globalVersion.addToBSON( result, "globalVersion" ); return false; } // TODO: Refactor all of this if ( version < globalVersion && version.hasCompatibleEpoch( globalVersion ) ) { while ( shardingState.inCriticalMigrateSection() ) { log() << "waiting till out of critical section" << endl; shardingState.waitTillNotInCriticalSection( 10 ); } errmsg = "shard global version for collection is higher than trying to set to '" + ns + "'"; result.append( "ns" , ns ); version.addToBSON( result, "version" ); globalVersion.addToBSON( result, "globalVersion" ); result.appendBool( "reloadConfig" , true ); return false; } if ( ! globalVersion.isSet() && ! authoritative ) { // Needed b/c when the last chunk is moved off a shard, the version gets reset to zero, which // should require a reload. while ( shardingState.inCriticalMigrateSection() ) { log() << "waiting till out of critical section" << endl; shardingState.waitTillNotInCriticalSection( 10 ); } // need authoritative for first look result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "first time for collection '" + ns + "'"; return false; } // Fall through to metadata reload below } else { // Dropping if ( ! authoritative ) { result.appendBool( "need_authoritative" , true ); result.append( "ns" , ns ); globalVersion.addToBSON( result, "globalVersion" ); errmsg = "dropping needs to be authoritative"; return false; } // Fall through to metadata reload below } ChunkVersion currVersion; Status status = shardingState.refreshMetadataIfNeeded( ns, version, &currVersion ); if (!status.isOK()) { // The reload itself was interrupted or confused here errmsg = str::stream() << "could not refresh metadata for " << ns << " with requested shard version " << version.toString() << ", stored shard version is " << currVersion.toString() << causedBy( status.reason() ); warning() << errmsg << endl; result.append( "ns" , ns ); version.addToBSON( result, "version" ); currVersion.addToBSON( result, "globalVersion" ); result.appendBool( "reloadConfig", true ); return false; } else if ( !version.isWriteCompatibleWith( currVersion ) ) { // We reloaded a version that doesn't match the version mongos was trying to // set. errmsg = str::stream() << "requested shard version differs from" << " config shard version for " << ns << ", requested version is " << version.toString() << " but found version " << currVersion.toString(); OCCASIONALLY warning() << errmsg << endl; // WARNING: the exact fields below are important for compatibility with mongos // version reload. result.append( "ns" , ns ); currVersion.addToBSON( result, "globalVersion" ); // If this was a reset of a collection or the last chunk moved out, inform mongos to // do a full reload. if (currVersion.epoch() != version.epoch() || !currVersion.isSet() ) { result.appendBool( "reloadConfig", true ); // Zero-version also needed to trigger full mongos reload, sadly // TODO: Make this saner, and less impactful (full reload on last chunk is bad) ChunkVersion( 0, 0, OID() ).addToBSON( result, "version" ); // For debugging version.addToBSON( result, "origVersion" ); } else { version.addToBSON( result, "version" ); } return false; } info->setVersion( ns , version ); return true; }
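On the failure paths at the end of the run() above, the reply deliberately carries the shard's own view in "globalVersion" and, for drops or epoch changes, "reloadConfig: true" plus a zero version. The helper below is hypothetical and only sketches how a caller might read those flags, assuming the reply is a plain BSONObj from the surrounding mongo headers with the usual "ok" field appended by the command framework.

// Illustrative, hypothetical caller-side interpretation of the setShardVersion reply.
struct SetShardVersionOutcome {
    bool ok;                // versions were write compatible, nothing more to do
    bool needAuthoritative; // retry the command with { authoritative: true }
    bool reloadConfig;      // full routing-table reload required (drop / last chunk moved)
};

SetShardVersionOutcome interpretSetShardVersionReply( const BSONObj& reply ) {
    SetShardVersionOutcome out;
    out.ok = reply["ok"].trueValue();
    out.needAuthoritative = reply["need_authoritative"].trueValue();
    out.reloadConfig = reply["reloadConfig"].trueValue();
    return out;
}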
namespace mongo { // -----ShardingState START ---- ShardingState::ShardingState() : _enabled(false) , _mutex( "ShardingState" ), _configServerTickets( 3 /* max number of concurrent config server refresh threads */ ) { } void ShardingState::enable( const string& server ) { scoped_lock lk(_mutex); _enabled = true; verify( server.size() ); if ( _configServer.size() == 0 ) _configServer = server; else { verify( server == _configServer ); } } void ShardingState::initialize(const string& server) { ShardedConnectionInfo::addHook(); shardingState.enable(server); configServer.init(server); } bool ShardingState::setShardName( const string& name ) { scoped_lock lk(_mutex); if ( _shardName.size() == 0 ) { // TODO SERVER-2299 remotely verify the name is sound w.r.t IPs _shardName = name; string clientAddr = cc().clientAddress(true); log() << "remote client " << clientAddr << " initialized this host as shard " << name; return true; } if ( _shardName == name ) return true; string clientAddr = cc().clientAddress(true); warning() << "remote client " << clientAddr << " tried to initialize this host as shard " << name << ", but shard name was previously initialized as " << _shardName; return false; } void ShardingState::gotShardName( const string& name ) { if ( setShardName( name ) ) return; string clientAddr = cc().clientAddress(true); stringstream ss; // Same error as above, to match for reporting ss << "remote client " << clientAddr << " tried to initialize this host as shard " << name << ", but shard name was previously initialized as " << _shardName; msgasserted( 13298 , ss.str() ); } void ShardingState::resetShardingState() { scoped_lock lk(_mutex); _enabled = false; _configServer.clear(); _shardName.clear(); _collMetadata.clear(); } // TODO we shouldn't need three ways for checking the version. Fix this. bool ShardingState::hasVersion( const string& ns ) { scoped_lock lk(_mutex); CollectionMetadataMap::const_iterator it = _collMetadata.find(ns); return it != _collMetadata.end(); } bool ShardingState::hasVersion( const string& ns , ChunkVersion& version ) { scoped_lock lk(_mutex); CollectionMetadataMap::const_iterator it = _collMetadata.find(ns); if ( it == _collMetadata.end() ) return false; CollectionMetadataPtr p = it->second; version = p->getShardVersion(); return true; } const ChunkVersion ShardingState::getVersion( const string& ns ) const { scoped_lock lk(_mutex); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); if ( it != _collMetadata.end() ) { CollectionMetadataPtr p = it->second; return p->getShardVersion(); } else { return ChunkVersion( 0, OID() ); } } void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ChunkVersion version ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); verify( it != _collMetadata.end() ) ; CollectionMetadataPtr p = it->second; // empty shards should have version 0 version = ( p->getNumChunks() > 1 ) ? 
version : ChunkVersion( 0, 0, p->getCollVersion().epoch() ); ChunkType chunk; chunk.setMin( min ); chunk.setMax( max ); string errMsg; CollectionMetadataPtr cloned( p->cloneMigrate( chunk, version, &errMsg ) ); // uassert to match old behavior, TODO: report errors w/o throwing uassert( 16855, errMsg, NULL != cloned.get() ); // TODO: a bit dangerous to have two different zero-version states - no-metadata and // no-version _collMetadata[ns] = cloned; } void ShardingState::undoDonateChunk( const string& ns, CollectionMetadataPtr prevMetadata ) { scoped_lock lk( _mutex ); log() << "ShardingState::undoDonateChunk acquired _mutex" << endl; CollectionMetadataMap::iterator it = _collMetadata.find( ns ); verify( it != _collMetadata.end() ); it->second = prevMetadata; } bool ShardingState::notePending( const string& ns, const BSONObj& min, const BSONObj& max, const OID& epoch, string* errMsg ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); if ( it == _collMetadata.end() ) { *errMsg = str::stream() << "could not note chunk " << "[" << min << "," << max << ")" << " as pending because the local metadata for " << ns << " has changed"; return false; } CollectionMetadataPtr metadata = it->second; // This can currently happen because drops aren't synchronized with in-migrations // The idea for checking this here is that in the future we shouldn't have this problem if ( metadata->getCollVersion().epoch() != epoch ) { *errMsg = str::stream() << "could not note chunk " << "[" << min << "," << max << ")" << " as pending because the epoch for " << ns << " has changed from " << epoch << " to " << metadata->getCollVersion().epoch(); return false; } ChunkType chunk; chunk.setMin( min ); chunk.setMax( max ); CollectionMetadataPtr cloned( metadata->clonePlusPending( chunk, errMsg ) ); if ( !cloned ) return false; _collMetadata[ns] = cloned; return true; } bool ShardingState::forgetPending( const string& ns, const BSONObj& min, const BSONObj& max, const OID& epoch, string* errMsg ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); if ( it == _collMetadata.end() ) { *errMsg = str::stream() << "no need to forget pending chunk " << "[" << min << "," << max << ")" << " because the local metadata for " << ns << " has changed"; return false; } CollectionMetadataPtr metadata = it->second; // This can currently happen because drops aren't synchronized with in-migrations // The idea for checking this here is that in the future we shouldn't have this problem if ( metadata->getCollVersion().epoch() != epoch ) { *errMsg = str::stream() << "no need to forget pending chunk " << "[" << min << "," << max << ")" << " because the epoch for " << ns << " has changed from " << epoch << " to " << metadata->getCollVersion().epoch(); return false; } ChunkType chunk; chunk.setMin( min ); chunk.setMax( max ); CollectionMetadataPtr cloned( metadata->cloneMinusPending( chunk, errMsg ) ); if ( !cloned ) return false; _collMetadata[ns] = cloned; return true; } void ShardingState::splitChunk( const string& ns, const BSONObj& min, const BSONObj& max, const vector<BSONObj>& splitKeys, ChunkVersion version ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); verify( it != _collMetadata.end() ) ; ChunkType chunk; chunk.setMin( min ); chunk.setMax( max ); string errMsg; CollectionMetadataPtr cloned( it->second->cloneSplit( chunk, splitKeys, version, &errMsg ) ); // uassert to match old behavior, TODO: report 
errors w/o throwing uassert( 16857, errMsg, NULL != cloned.get() ); _collMetadata[ns] = cloned; } void ShardingState::mergeChunks( const string& ns, const BSONObj& minKey, const BSONObj& maxKey, ChunkVersion mergedVersion ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); verify( it != _collMetadata.end() ); string errMsg; CollectionMetadataPtr cloned( it->second->cloneMerge( minKey, maxKey, mergedVersion, &errMsg ) ); // uassert to match old behavior, TODO: report errors w/o throwing uassert( 17004, errMsg, NULL != cloned.get() ); _collMetadata[ns] = cloned; } void ShardingState::resetMetadata( const string& ns ) { scoped_lock lk( _mutex ); warning() << "resetting metadata for " << ns << ", this should only be used in testing" << endl; _collMetadata.erase( ns ); } Status ShardingState::refreshMetadataIfNeeded( const string& ns, const ChunkVersion& reqShardVersion, ChunkVersion* latestShardVersion ) { // The _configServerTickets serializes this process such that only a small number of threads // can try to refresh at the same time. LOG( 2 ) << "metadata refresh requested for " << ns << " at shard version " << reqShardVersion << endl; // // Queuing of refresh requests starts here when remote reload is needed. This may take time. // TODO: Explicitly expose the queuing discipline. // _configServerTickets.waitForTicket(); TicketHolderReleaser needTicketFrom( &_configServerTickets ); // // Fast path - check if the requested version is at a higher version than the current // metadata version or a different epoch before verifying against config server. // CollectionMetadataPtr storedMetadata; { scoped_lock lk( _mutex ); CollectionMetadataMap::iterator it = _collMetadata.find( ns ); if ( it != _collMetadata.end() ) storedMetadata = it->second; } ChunkVersion storedShardVersion; if ( storedMetadata ) storedShardVersion = storedMetadata->getShardVersion(); *latestShardVersion = storedShardVersion; if ( storedShardVersion >= reqShardVersion && storedShardVersion.epoch() == reqShardVersion.epoch() ) { // Don't need to remotely reload if we're in the same epoch with a >= version return Status::OK(); } // // Slow path - remotely reload // // Cases: // A) Initial config load and/or secondary take-over. // B) Migration TO this shard finished, notified by mongos. // C) Dropping a collection, notified (currently) by mongos. // D) Stale client wants to reload metadata with a different *epoch*, so we aren't sure. 
if ( storedShardVersion.epoch() != reqShardVersion.epoch() ) { // Need to remotely reload if our epochs aren't the same, to verify LOG( 1 ) << "metadata change requested for " << ns << ", from shard version " << storedShardVersion << " to " << reqShardVersion << ", need to verify with config server" << endl; } else { // Need to remotely reload since our epochs aren't the same but our version is greater LOG( 1 ) << "metadata version update requested for " << ns << ", from shard version " << storedShardVersion << " to " << reqShardVersion << ", need to verify with config server" << endl; } return doRefreshMetadata( ns, reqShardVersion, true, latestShardVersion ); } Status ShardingState::refreshMetadataNow( const string& ns, ChunkVersion* latestShardVersion ) { return doRefreshMetadata( ns, ChunkVersion( 0, 0, OID() ), false, latestShardVersion ); } Status ShardingState::doRefreshMetadata( const string& ns, const ChunkVersion& reqShardVersion, bool useRequestedVersion, ChunkVersion* latestShardVersion ) { // The idea here is that we're going to reload the metadata from the config server, but // we need to do so outside any locks. When we get our result back, if the current metadata // has changed, we may not be able to install the new metadata. // // Get the initial metadata // No DBLock is needed since the metadata is expected to change during reload. // CollectionMetadataPtr beforeMetadata; string shardName; { scoped_lock lk( _mutex ); CollectionMetadataMap::iterator it = _collMetadata.find( ns ); if ( it != _collMetadata.end() ) beforeMetadata = it->second; shardName = _shardName; } ChunkVersion beforeShardVersion; ChunkVersion beforeCollVersion; if ( beforeMetadata ) { beforeShardVersion = beforeMetadata->getShardVersion(); beforeCollVersion = beforeMetadata->getCollVersion(); } *latestShardVersion = beforeShardVersion; // We can't reload without a shard name. Must check here before loading, since shard name // may have changed if we checked it earlier and released the _mutex. if ( shardName.empty() ) { string errMsg = str::stream() << "cannot refresh metadata for " << ns << " before shard name has been set"; LOG( 0 ) << errMsg << endl; return Status( ErrorCodes::IllegalOperation, errMsg ); } // // Determine whether we need to diff or fully reload // bool fullReload = false; if ( !beforeMetadata ) { // We don't have any metadata to reload from fullReload = true; } else if ( useRequestedVersion && reqShardVersion.epoch() != beforeShardVersion.epoch() ) { // It's not useful to use the metadata as a base because we think the epoch will differ fullReload = true; } // // Load the metadata from the remote server, start construction // LOG( 0 ) << "remotely refreshing metadata for " << ns << ( useRequestedVersion ? string( " with requested shard version " ) + reqShardVersion.toString() : "" ) << ( fullReload ? ", current shard version is " : " based on current shard version " ) << beforeShardVersion << ", current metadata version is " << beforeCollVersion << endl; string errMsg; ConnectionString configServerLoc = ConnectionString::parse( _configServer, errMsg ); MetadataLoader mdLoader( configServerLoc ); CollectionMetadata* remoteMetadataRaw = new CollectionMetadata(); CollectionMetadataPtr remoteMetadata( remoteMetadataRaw ); Timer refreshTimer; Status status = mdLoader.makeCollectionMetadata( ns, shardName, ( fullReload ? 
NULL : beforeMetadata.get() ), remoteMetadataRaw ); long long refreshMillis = refreshTimer.millis(); if ( status.code() == ErrorCodes::NamespaceNotFound ) { remoteMetadata.reset(); remoteMetadataRaw = NULL; } else if ( !status.isOK() ) { warning() << "could not remotely refresh metadata for " << ns << causedBy( status.reason() ) << endl; return status; } ChunkVersion remoteShardVersion; ChunkVersion remoteCollVersion; if ( remoteMetadata ) { remoteShardVersion = remoteMetadata->getShardVersion(); remoteCollVersion = remoteMetadata->getCollVersion(); } // // Get ready to install loaded metadata if needed // CollectionMetadataPtr afterMetadata; ChunkVersion afterShardVersion; ChunkVersion afterCollVersion; ChunkVersion::VersionChoice choice; // If we choose to install the new metadata, this describes the kind of install enum InstallType { InstallType_New, InstallType_Update, InstallType_Replace, InstallType_Drop, InstallType_None } installType = InstallType_None; // compiler complains otherwise { // DBLock needed since we're now potentially changing the metadata, and don't want // reads/writes to be ongoing. Lock::DBWrite writeLk( ns ); // // Get the metadata now that the load has completed // scoped_lock lk( _mutex ); CollectionMetadataMap::iterator it = _collMetadata.find( ns ); if ( it != _collMetadata.end() ) afterMetadata = it->second; if ( afterMetadata ) { afterShardVersion = afterMetadata->getShardVersion(); afterCollVersion = afterMetadata->getCollVersion(); } *latestShardVersion = afterShardVersion; // // Resolve newer pending chunks with the remote metadata, finish construction // status = mdLoader.promotePendingChunks( afterMetadata.get(), remoteMetadataRaw ); if ( !status.isOK() ) { warning() << "remote metadata for " << ns << " is inconsistent with current pending chunks" << causedBy( status.reason() ) << endl; return status; } // // Compare the 'before', 'after', and 'remote' versions/epochs and choose newest // Zero-epochs (sentinel value for "dropped" collections), are tested by // !epoch.isSet(). // choice = ChunkVersion::chooseNewestVersion( beforeCollVersion, afterCollVersion, remoteCollVersion ); if ( choice == ChunkVersion::VersionChoice_Remote ) { dassert(!remoteCollVersion.epoch().isSet() || remoteShardVersion >= beforeShardVersion); if ( !afterCollVersion.epoch().isSet() ) { // First metadata load installType = InstallType_New; dassert( it == _collMetadata.end() ); _collMetadata.insert( make_pair( ns, remoteMetadata ) ); } else if ( remoteCollVersion.epoch().isSet() && remoteCollVersion.epoch() == afterCollVersion.epoch() ) { // Update to existing metadata installType = InstallType_Update; // Invariant: If CollMetadata was not found, version should be have been 0. dassert( it != _collMetadata.end() ); it->second = remoteMetadata; } else if ( remoteCollVersion.epoch().isSet() ) { // New epoch detected, replacing metadata installType = InstallType_Replace; // Invariant: If CollMetadata was not found, version should be have been 0. dassert( it != _collMetadata.end() ); it->second = remoteMetadata; } else { dassert( !remoteCollVersion.epoch().isSet() ); // Drop detected installType = InstallType_Drop; _collMetadata.erase( it ); } *latestShardVersion = remoteShardVersion; } } // End _mutex // End DBWrite // // Do messaging based on what happened above // string versionMsg = str::stream() << " (loaded metadata version : " << remoteCollVersion.toString() << ( beforeCollVersion.epoch() == afterCollVersion.epoch() ? 
string( ", stored version : " ) + afterCollVersion.toString() : string( ", stored versions : " ) + beforeCollVersion.toString() + " / " + afterCollVersion.toString() ) << ", took " << refreshMillis << "ms)"; if ( choice == ChunkVersion::VersionChoice_Unknown ) { string errMsg = str::stream() << "need to retry loading metadata for " << ns << ", collection may have been dropped or recreated during load" << versionMsg; warning() << errMsg << endl; return Status( ErrorCodes::RemoteChangeDetected, errMsg ); } if ( choice == ChunkVersion::VersionChoice_Local ) { LOG( 0 ) << "newer metadata not found for " << ns << versionMsg << endl; return Status::OK(); } dassert( choice == ChunkVersion::VersionChoice_Remote ); switch( installType ) { case InstallType_New: LOG( 0 ) << "loaded new metadata for " << ns << versionMsg << endl; break; case InstallType_Update: LOG( 0 ) << "loaded newer metadata for " << ns << versionMsg << endl; break; case InstallType_Replace: LOG( 0 ) << "replacing metadata for " << ns << versionMsg << endl; break; case InstallType_Drop: LOG( 0 ) << "dropping metadata for " << ns << versionMsg << endl; break; default: verify( false ); break; } return Status::OK(); } void ShardingState::appendInfo( BSONObjBuilder& b ) { b.appendBool( "enabled" , _enabled ); if ( ! _enabled ) return; b.append( "configServer" , _configServer ); b.append( "shardName" , _shardName ); { BSONObjBuilder bb( b.subobjStart( "versions" ) ); scoped_lock lk(_mutex); for ( CollectionMetadataMap::iterator it = _collMetadata.begin(); it != _collMetadata.end(); ++it ) { CollectionMetadataPtr p = it->second; bb.appendTimestamp( it->first , p->getShardVersion().toLong() ); } bb.done(); } } bool ShardingState::needCollectionMetadata( const string& ns ) const { if ( ! _enabled ) return false; if ( ! ShardedConnectionInfo::get( false ) ) return false; return true; } CollectionMetadataPtr ShardingState::getCollectionMetadata( const string& ns ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); if ( it == _collMetadata.end() ) { return CollectionMetadataPtr(); } else { return it->second; } } ShardingState shardingState; // -----ShardingState END ---- // -----ShardedConnectionInfo START ---- boost::thread_specific_ptr<ShardedConnectionInfo> ShardedConnectionInfo::_tl; ShardedConnectionInfo::ShardedConnectionInfo() { _forceVersionOk = false; _id.clear(); } ShardedConnectionInfo* ShardedConnectionInfo::get( bool create ) { ShardedConnectionInfo* info = _tl.get(); if ( ! 
info && create ) { LOG(1) << "entering shard mode for connection" << endl; info = new ShardedConnectionInfo(); _tl.reset( info ); } return info; } void ShardedConnectionInfo::reset() { _tl.reset(); } const ChunkVersion ShardedConnectionInfo::getVersion( const string& ns ) const { NSVersionMap::const_iterator it = _versions.find( ns ); if ( it != _versions.end() ) { return it->second; } else { return ChunkVersion( 0, OID() ); } } void ShardedConnectionInfo::setVersion( const string& ns , const ChunkVersion& version ) { _versions[ns] = version; } void ShardedConnectionInfo::addHook() { static mongo::mutex lock("ShardedConnectionInfo::addHook mutex"); static bool done = false; scoped_lock lk(lock); if (!done) { log() << "first cluster operation detected, adding sharding hook to enable versioning " "and authentication to remote servers" << endl; pool.addHook(new ShardingConnectionHook(false)); shardConnectionPool.addHook(new ShardingConnectionHook(true)); done = true; } } void ShardedConnectionInfo::setID( const OID& id ) { _id = id; } class MongodShardCommand : public Command { public: MongodShardCommand( const char * n ) : Command( n ) { } virtual bool slaveOk() const { return false; } virtual bool adminOnly() const { return true; } }; bool haveLocalShardingInfo( const string& ns ) { if ( ! shardingState.enabled() ) return false; if ( ! shardingState.hasVersion( ns ) ) return false; return ShardedConnectionInfo::get(false) > 0; } class UnsetShardingCommand : public MongodShardCommand { public: UnsetShardingCommand() : MongodShardCommand("unsetSharding") {} virtual void help( stringstream& help ) const { help << "internal"; } virtual LockType locktype() const { return NONE; } virtual bool slaveOk() const { return true; } virtual void addRequiredPrivileges(const std::string& dbname, const BSONObj& cmdObj, std::vector<Privilege>* out) { ActionSet actions; actions.addAction(ActionType::internal); out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); } bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { ShardedConnectionInfo::reset(); return true; } } unsetShardingCommand; class SetShardVersion : public MongodShardCommand { public: SetShardVersion() : MongodShardCommand("setShardVersion") {} virtual void help( stringstream& help ) const { help << "internal"; } virtual bool slaveOk() const { return true; } virtual LockType locktype() const { return NONE; } virtual void addRequiredPrivileges(const std::string& dbname, const BSONObj& cmdObj, std::vector<Privilege>* out) { ActionSet actions; actions.addAction(ActionType::internal); out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); } bool checkConfigOrInit( const string& configdb , bool authoritative , string& errmsg , BSONObjBuilder& result , bool locked=false ) const { if ( configdb.size() == 0 ) { errmsg = "no configdb"; return false; } if ( shardingState.enabled() ) { if ( configdb == shardingState.getConfigServer() ) return true; result.append( "configdb" , BSON( "stored" << shardingState.getConfigServer() << "given" << configdb ) ); errmsg = str::stream() << "mongos specified a different config database string : " << "stored : " << shardingState.getConfigServer() << " vs given : " << configdb; return false; } if ( ! 
authoritative ) { result.appendBool( "need_authoritative" , true ); errmsg = "first setShardVersion"; return false; } if ( locked ) { ShardingState::initialize(configdb); return true; } Lock::GlobalWrite lk; return checkConfigOrInit( configdb , authoritative , errmsg , result , true ); } bool checkMongosID( ShardedConnectionInfo* info, const BSONElement& id, string& errmsg ) { if ( id.type() != jstOID ) { if ( ! info->hasID() ) { warning() << "bad serverID set in setShardVersion and none in info: " << id << endl; } // TODO: fix this //errmsg = "need serverID to be an OID"; //return 0; return true; } OID clientId = id.__oid(); if ( ! info->hasID() ) { info->setID( clientId ); return true; } if ( clientId != info->getID() ) { errmsg = "server id has changed!"; return false; } return true; } bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { // Steps // 1. check basic config // 2. extract params from command // 3. fast check // 4. slow check (LOCKS) // step 1 lastError.disableForCommand(); ShardedConnectionInfo* info = ShardedConnectionInfo::get( true ); // make sure we have the mongos id for writebacks if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) ) return false; bool authoritative = cmdObj.getBoolField( "authoritative" ); // check config server is ok or enable sharding if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) ) return false; // check shard name/hosts are correct if ( cmdObj["shard"].type() == String ) { shardingState.gotShardName( cmdObj["shard"].String() ); } // Handle initial shard connection if( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ){ result.append( "initialized", true ); // Send back wire version to let mongos know what protocol we can speak result.append( "minWireVersion", minWireVersion ); result.append( "maxWireVersion", maxWireVersion ); return true; } // we can run on a slave up to here if ( ! isMaster( "admin" ) ) { result.append( "errmsg" , "not master" ); result.append( "note" , "from post init in setShardVersion" ); return false; } // step 2 string ns = cmdObj["setShardVersion"].valuestrsafe(); if ( ns.size() == 0 ) { errmsg = "need to specify namespace"; return false; } if( ! ChunkVersion::canParseBSON( cmdObj, "version" ) ){ errmsg = "need to specify version"; return false; } const ChunkVersion version = ChunkVersion::fromBSON( cmdObj, "version" ); // step 3 const ChunkVersion oldVersion = info->getVersion(ns); const ChunkVersion globalVersion = shardingState.getVersion(ns); oldVersion.addToBSON( result, "oldVersion" ); if ( globalVersion.isSet() && version.isSet() ) { // this means there is no reset going on an either side // so its safe to make some assumptions if ( version.isWriteCompatibleWith( globalVersion ) ) { // mongos and mongod agree! if ( ! oldVersion.isWriteCompatibleWith( version ) ) { if ( oldVersion < globalVersion && oldVersion.hasCompatibleEpoch(globalVersion) ) { info->setVersion( ns , version ); } else if ( authoritative ) { // this means there was a drop and our version is reset info->setVersion( ns , version ); } else { result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "verifying drop on '" + ns + "'"; return false; } } return true; } } // step 4 // this is because of a weird segfault I saw and I can't see why this should ever be set massert( 13647 , str::stream() << "context should be empty here, is: " << cc().getContext()->ns() , cc().getContext() == 0 ); if ( oldVersion.isSet() && ! 
globalVersion.isSet() ) { // this had been reset info->setVersion( ns , ChunkVersion( 0, OID() ) ); } if ( ! version.isSet() && ! globalVersion.isSet() ) { // this connection is cleaning itself info->setVersion( ns , ChunkVersion( 0, OID() ) ); return true; } // Cases below all either return OR fall-through to remote metadata reload. if ( version.isSet() || !globalVersion.isSet() ) { // Not Dropping // TODO: Refactor all of this if ( version < oldVersion && version.hasCompatibleEpoch( oldVersion ) ) { errmsg = "this connection already had a newer version of collection '" + ns + "'"; result.append( "ns" , ns ); version.addToBSON( result, "newVersion" ); globalVersion.addToBSON( result, "globalVersion" ); return false; } // TODO: Refactor all of this if ( version < globalVersion && version.hasCompatibleEpoch( globalVersion ) ) { while ( shardingState.inCriticalMigrateSection() ) { log() << "waiting till out of critical section" << endl; shardingState.waitTillNotInCriticalSection( 10 ); } errmsg = "shard global version for collection is higher than trying to set to '" + ns + "'"; result.append( "ns" , ns ); version.addToBSON( result, "version" ); globalVersion.addToBSON( result, "globalVersion" ); result.appendBool( "reloadConfig" , true ); return false; } if ( ! globalVersion.isSet() && ! authoritative ) { // Needed b/c when the last chunk is moved off a shard, the version gets reset to zero, which // should require a reload. while ( shardingState.inCriticalMigrateSection() ) { log() << "waiting till out of critical section" << endl; shardingState.waitTillNotInCriticalSection( 10 ); } // need authoritative for first look result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "first time for collection '" + ns + "'"; return false; } // Fall through to metadata reload below } else { // Dropping if ( ! authoritative ) { result.appendBool( "need_authoritative" , true ); result.append( "ns" , ns ); globalVersion.addToBSON( result, "globalVersion" ); errmsg = "dropping needs to be authoritative"; return false; } // Fall through to metadata reload below } ChunkVersion currVersion; Status status = shardingState.refreshMetadataIfNeeded( ns, version, &currVersion ); if (!status.isOK()) { // The reload itself was interrupted or confused here errmsg = str::stream() << "could not refresh metadata for " << ns << " with requested shard version " << version.toString() << ", stored shard version is " << currVersion.toString() << causedBy( status.reason() ); warning() << errmsg << endl; result.append( "ns" , ns ); version.addToBSON( result, "version" ); currVersion.addToBSON( result, "globalVersion" ); result.appendBool( "reloadConfig", true ); return false; } else if ( !version.isWriteCompatibleWith( currVersion ) ) { // We reloaded a version that doesn't match the version mongos was trying to // set. errmsg = str::stream() << "requested shard version differs from" << " config shard version for " << ns << ", requested version is " << version.toString() << " but found version " << currVersion.toString(); OCCASIONALLY warning() << errmsg << endl; // WARNING: the exact fields below are important for compatibility with mongos // version reload. result.append( "ns" , ns ); currVersion.addToBSON( result, "globalVersion" ); // If this was a reset of a collection or the last chunk moved out, inform mongos to // do a full reload. 
class GetShardVersion : public MongodShardCommand {
public:
    GetShardVersion() : MongodShardCommand("getShardVersion") {}

    virtual void help( stringstream& help ) const {
        help << " example: { getShardVersion : 'alleyinsider.foo' } ";
    }

    virtual LockType locktype() const { return NONE; }

    virtual Status checkAuthForCommand(ClientBasic* client,
                                       const std::string& dbname,
                                       const BSONObj& cmdObj) {
        if (!client->getAuthorizationSession()->isAuthorizedForActionsOnResource(
                ResourcePattern::forExactNamespace(NamespaceString(parseNs(dbname, cmdObj))),
                ActionType::getShardVersion)) {
            return Status(ErrorCodes::Unauthorized, "Unauthorized");
        }
        return Status::OK();
    }

    virtual std::string parseNs(const std::string& dbname, const BSONObj& cmdObj) const {
        return parseNsFullyQualified(dbname, cmdObj);
    }

    bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
        string ns = cmdObj["getShardVersion"].valuestrsafe();
        if ( ns.size() == 0 ) {
            errmsg = "need to specify full namespace";
            return false;
        }

        result.append( "configServer" , shardingState.getConfigServer() );

        result.appendTimestamp( "global" , shardingState.getVersion(ns).toLong() );

        ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
        result.appendBool( "inShardedMode" , info != 0 );
        if ( info )
            result.appendTimestamp( "mine" , info->getVersion(ns).toLong() );
        else
            result.appendTimestamp( "mine" , 0 );

        if ( cmdObj["fullMetadata"].trueValue() ) {
            CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( ns );
            if ( metadata ) result.append( "metadata", metadata->toBSON() );
            else result.append( "metadata", BSONObj() );
        }

        return true;
    }

} getShardVersion;

class ShardingStateCmd : public MongodShardCommand {
public:
    ShardingStateCmd() : MongodShardCommand( "shardingState" ) {}

    virtual LockType locktype() const { return WRITE; } // TODO: figure out how to make this not need to lock

    virtual void addRequiredPrivileges(const std::string& dbname,
                                       const BSONObj& cmdObj,
                                       std::vector<Privilege>* out) {
        ActionSet actions;
        actions.addAction(ActionType::shardingState);
        out->push_back(Privilege(ResourcePattern::forClusterResource(), actions));
    }

    bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
        shardingState.appendInfo( result );
        return true;
    }

} shardingStateCmd;
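// NOTE (editorial sketch, not part of the original source): a minimal local exercise of
// getShardVersion, assuming a default-constructible DBDirectClient is available in this build.
// The reply carries "configServer", "global", "inShardedMode", "mine", and, when
// { fullMetadata: true } is passed, a "metadata" document, per the run() implementation above.
namespace {
    void logLocalShardVersion( const string& ns ) {
        DBDirectClient client;
        BSONObj res;
        // the command takes the fully qualified namespace, e.g. "alleyinsider.foo"
        client.runCommand( "admin" , BSON( "getShardVersion" << ns << "fullMetadata" << true ) , res );
        log() << "shard version info for " << ns << ": " << res << endl;
    }
}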
/**
 * @return true if not in sharded mode or if version for this client is ok
 */
bool shardVersionOk( const string& ns , string& errmsg, ChunkVersion& received, ChunkVersion& wanted ) {

    if ( ! shardingState.enabled() )
        return true;

    if ( ! isMasterNs( ns.c_str() ) ) {
        // right now connections to secondaries aren't versioned at all
        return true;
    }

    ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );

    if ( ! info ) {
        // this means the client has nothing sharded
        // so this allows direct connections to do whatever they want
        // which I think is the correct behavior
        return true;
    }

    if ( info->inForceVersionOkMode() ) {
        return true;
    }

    // TODO : all collections at some point, be sharded or not, will have a version
    //  (and a CollectionMetadata)
    received = info->getVersion( ns );
    wanted = shardingState.getVersion( ns );

    if( received.isWriteCompatibleWith( wanted ) ) return true;

    //
    // Figure out exactly why not compatible, send appropriate error message
    // The versions themselves are returned in the error, so not needed in messages here
    //

    // Check epoch first, to send more meaningful message, since other parameters probably
    // won't match either
    if( ! wanted.hasCompatibleEpoch( received ) ){
        errmsg = str::stream() << "version epoch mismatch detected for " << ns << ", "
                               << "the collection may have been dropped and recreated";
        return false;
    }

    if( ! wanted.isSet() && received.isSet() ){
        errmsg = str::stream() << "this shard no longer contains chunks for " << ns << ", "
                               << "the collection may have been dropped";
        return false;
    }

    if( wanted.isSet() && ! received.isSet() ){
        errmsg = str::stream() << "this shard contains versioned chunks for " << ns << ", "
                               << "but no version set in request";
        return false;
    }

    if( wanted.majorVersion() != received.majorVersion() ){
        //
        // Could be > or < - wanted is > if this is the source of a migration,
        // wanted < if this is the target of a migration
        //
        errmsg = str::stream() << "version mismatch detected for " << ns << ", "
                               << "stored major version " << wanted.majorVersion()
                               << " does not match received " << received.majorVersion();
        return false;
    }

    // Those are all the reasons the versions can mismatch
    verify( false );
    return false;
}

void usingAShardConnection( const string& addr ) { }

}
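// NOTE (editorial sketch, not part of the original source): a minimal illustration of how a
// request path might consult shardVersionOk() before touching a collection. The wrapper name
// is hypothetical, and the sketch assumes it is compiled in the same namespace as
// shardVersionOk() so the unqualified names resolve.
namespace {
    bool ensureShardVersionOkOrWarn( const string& ns ) {
        string errmsg;
        ChunkVersion received, wanted;
        if ( shardVersionOk( ns , errmsg , received , wanted ) )
            return true;
        warning() << "stale shard version for " << ns << ": " << errmsg
                  << " ( received: " << received.toString()
                  << " wanted: " << wanted.toString() << " )" << endl;
        return false;
    }
}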