static void buildStaleError( const ChunkVersion& shardVersionRecvd, const ChunkVersion& shardVersionWanted, BatchedErrorDetail* error ) { // Write stale error to results error->setErrCode( ErrorCodes::StaleShardVersion ); BSONObjBuilder infoB; shardVersionWanted.addToBSON( infoB, "vWanted" ); error->setErrInfo( infoB.obj() ); string errMsg = stream() << "stale shard version detected before write, received " << shardVersionRecvd.toString() << " but local version is " << shardVersionWanted.toString(); error->setErrMessage( errMsg ); }
void ShardingTestFixture::expectSetShardVersion(const HostAndPort& expectedHost, const ShardType& expectedShard, const NamespaceString& expectedNs, const ChunkVersion& expectedChunkVersion) { onCommand([&](const RemoteCommandRequest& request) { ASSERT_EQ(expectedHost, request.target); ASSERT_EQUALS(rpc::makeEmptyMetadata(), request.metadata); SetShardVersionRequest ssv = assertGet(SetShardVersionRequest::parseFromBSON(request.cmdObj)); ASSERT(!ssv.isInit()); ASSERT(ssv.isAuthoritative()); ASSERT_EQ(grid.shardRegistry()->getConfigServerConnectionString().toString(), ssv.getConfigServer().toString()); ASSERT_EQ(expectedShard.getHost(), ssv.getShardConnectionString().toString()); ASSERT_EQ(expectedNs.toString(), ssv.getNS().ns()); ASSERT_EQ(expectedChunkVersion.toString(), ssv.getNSVersion().toString()); return BSON("ok" << true); }); }
/** * Updates the remote cached version on the remote shard host (primary, in the case of replica * sets) if needed with a fully-qualified shard version for the given namespace: * config server(s) + shard name + shard version * * If no remote cached version has ever been set, an initial shard version is sent. * * If the namespace is empty and no version has ever been sent, the config server + shard name * is sent to the remote shard host to initialize the connection as coming from mongos. * NOTE: This initialization is *best-effort only*. Operations which wish to correctly version * must send the namespace. * * Config servers are special and are not (unless otherwise a shard) kept up to date with this * protocol. This is safe so long as config servers only contain unversioned collections. * * It is an error to call checkShardVersion with an unversionable connection (isVersionableCB). * * @return true if we contacted the remote host */ bool checkShardVersion(DBClientBase* conn_in, const string& ns, ChunkManagerPtr refManager, bool authoritative, int tryNumber) { // TODO: cache, optimize, etc... 
// Empty namespaces are special - we require initialization but not versioning if (ns.size() == 0) { return initShardVersionEmptyNS(conn_in); } auto status = grid.catalogCache()->getDatabase(nsToDatabase(ns)); if (!status.isOK()) { return false; } shared_ptr<DBConfig> conf = status.getValue(); DBClientBase* conn = getVersionable(conn_in); verify(conn); // errors thrown above unsigned long long officialSequenceNumber = 0; ShardPtr primary; ChunkManagerPtr manager; if (authoritative) conf->getChunkManagerIfExists(ns, true); conf->getChunkManagerOrPrimary(ns, manager, primary); if (manager) { officialSequenceNumber = manager->getSequenceNumber(); } const auto shard = grid.shardRegistry()->getShard(conn->getServerAddress()); uassert(ErrorCodes::ShardNotFound, str::stream() << conn->getServerAddress() << " is not recognized as a shard", shard); // Check this manager against the reference manager if (manager) { if (refManager && !refManager->compatibleWith(*manager, shard->getId())) { const ChunkVersion refVersion(refManager->getVersion(shard->getId())); const ChunkVersion currentVersion(manager->getVersion(shard->getId())); string msg(str::stream() << "manager (" << currentVersion.toString() << " : " << manager->getSequenceNumber() << ") " << "not compatible with reference manager (" << refVersion.toString() << " : " << refManager->getSequenceNumber() << ") " << "on shard " << shard->getId() << " (" << shard->getConnString().toString() << ")"); throw SendStaleConfigException(ns, msg, refVersion, currentVersion); } } else if (refManager) { string msg(str::stream() << "not sharded (" << ((manager.get() == 0) ? 
string("<none>") : str::stream() << manager->getSequenceNumber()) << ") but has reference manager (" << refManager->getSequenceNumber() << ") " << "on conn " << conn->getServerAddress() << " (" << conn_in->getServerAddress() << ")"); throw SendStaleConfigException( ns, msg, refManager->getVersion(shard->getId()), ChunkVersion::UNSHARDED()); } // Do not send setShardVersion to collections on the config servers - this causes problems // when config servers are also shards and get SSV with conflicting names. // TODO: Make config servers regular shards if (primary && primary->getId() == "config") { return false; } // Has the ChunkManager been reloaded since the last time we updated the shard version over // this connection? If we've never updated the shard version, do so now. unsigned long long sequenceNumber = 0; if (connectionShardStatus.getSequence(conn, ns, &sequenceNumber)) { if (sequenceNumber == officialSequenceNumber) { return false; } } ChunkVersion version = ChunkVersion(0, 0, OID()); if (manager) { version = manager->getVersion(shard->getId()); } LOG(1) << "setting shard version of " << version << " for " << ns << " on shard " << shard->toString(); LOG(3) << "last version sent with chunk manager iteration " << sequenceNumber << ", current chunk manager iteration is " << officialSequenceNumber; BSONObj result; if (setShardVersion(*conn, ns, grid.catalogManager()->connectionString().toString(), version, manager.get(), authoritative, result)) { LOG(1) << " setShardVersion success: " << result; connectionShardStatus.setSequence(conn, ns, officialSequenceNumber); return true; } LOG(1) << " setShardVersion failed!\n" << result << endl; if (result["need_authoritative"].trueValue()) massert(10428, "need_authoritative set but in authoritative mode already", !authoritative); if (!authoritative) { // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, 
refManager, 1, tryNumber + 1); return true; } if (result["reloadConfig"].trueValue()) { if (result["version"].timestampTime() == Date_t()) { warning() << "reloading full configuration for " << conf->name() << ", connection state indicates significant version changes"; // reload db conf->reload(); } else { // reload config conf->getChunkManager(ns, true); } } const int maxNumTries = 7; if (tryNumber < maxNumTries) { LOG(tryNumber < (maxNumTries / 2) ? 1 : 0) << "going to retry checkShardVersion shard: " << shard->toString() << " " << result; sleepmillis(10 * tryNumber); // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1); return true; } string errmsg = str::stream() << "setShardVersion failed shard: " << shard->toString() << " " << result; log() << " " << errmsg << endl; massert(10429, errmsg, 0); return true; }
/**
 * Validates a moveChunk request against this shard's metadata and snapshots the committed
 * collection metadata for the duration of the migration.
 *
 * Fails with a user assertion if the operation carries no shard version, if the metadata
 * refresh fails, if the refreshed shard version has major version zero, or if the requested
 * chunk is not found with exactly the requested bounds. Throws SendStaleConfigException if
 * the collection epoch differs from the one the command was sent with.
 *
 * @param txn operation context; must not hold any locks on entry
 * @param request the parsed moveChunk request, moved into _args
 */
MigrationSourceManager::MigrationSourceManager(OperationContext* txn, MoveChunkRequest request)
    : _args(std::move(request)), _startTime() {
    // Must be entered without any locks held; the locks that are needed are taken further down
    invariant(!txn->lockState()->isLocked());

    const auto& shardingOpState = OperationShardingState::get(txn);
    uassert(ErrorCodes::InvalidOptions,
            "collection version is missing",
            shardingOpState.hasShardVersion());

    const auto& nss = _args.getNss();

    // Even though the moveChunk command transmits a value in the operation's shardVersion field,
    // this value does not actually contain the shard version, but the global collection version.
    const ChunkVersion expectedCollectionVersion = shardingOpState.getShardVersion(nss);

    // Printable form of the requested bounds, shared by the log line and every error below
    const std::string chunkRangeStr =
        ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString();

    log() << "Starting chunk migration for " << chunkRangeStr
          << " with expected collection version " << expectedCollectionVersion;

    // Refresh the metadata and capture the shard version that the refresh reports
    ChunkVersion shardVersion;
    Status refreshStatus =
        ShardingState::get(txn)->refreshMetadataNow(txn, nss.ns(), &shardVersion);
    uassert(refreshStatus.code(),
            str::stream() << "cannot start migrate of chunk " << chunkRangeStr << " due to "
                          << refreshStatus.toString(),
            refreshStatus.isOK());

    // If the major version is zero, this means we do not have any chunks locally to migrate in
    // the first place
    uassert(ErrorCodes::IncompatibleShardingMetadata,
            str::stream() << "cannot start migrate of chunk " << chunkRangeStr
                          << " with zero shard version",
            shardVersion.majorVersion() != 0);

    // Snapshot the committed metadata from the time the migration starts
    {
        ScopedTransaction scopedXact(txn, MODE_IS);
        AutoGetCollection autoColl(txn, nss, MODE_IS);

        _committedMetadata = CollectionShardingState::get(txn, nss)->getMetadata();
    }

    const ChunkVersion collectionVersion = _committedMetadata->getCollVersion();

    // A different epoch means the collection is not the one the command was issued against
    if (collectionVersion.epoch() != expectedCollectionVersion.epoch()) {
        throw SendStaleConfigException(
            nss.ns(),
            str::stream() << "cannot move chunk " << chunkRangeStr
                          << " because collection may have been dropped. "
                          << "current epoch: " << collectionVersion.epoch()
                          << ", cmd epoch: " << expectedCollectionVersion.epoch(),
            expectedCollectionVersion,
            collectionVersion);
    }

    // With nonzero shard version, we must have a coll version >= our shard version
    invariant(collectionVersion >= shardVersion);

    // With nonzero shard version, we must have a shard key
    invariant(!_committedMetadata->getKeyPattern().isEmpty());

    ChunkType origChunk;
    const bool chunkFound = _committedMetadata->getNextChunk(_args.getMinKey(), &origChunk);

    // If this assertion is hit, it means that whoever called the shard moveChunk command
    // (mongos or the CSRS balancer) did not check whether the chunk actually belongs to this
    // shard. It is a benign error and does not indicate data corruption.
    uassert(40145,
            str::stream() << "Chunk with bounds " << chunkRangeStr
                          << " is not owned by this shard.",
            chunkFound);

    // The chunk that was found must have exactly the requested bounds
    const bool boundsMatch = origChunk.getMin().woCompare(_args.getMinKey()) == 0 &&
        origChunk.getMax().woCompare(_args.getMaxKey()) == 0;
    if (!boundsMatch) {
        uasserted(40146,
                  str::stream() << "Unable to find chunk with the exact bounds " << chunkRangeStr
                                << " at collection version " << collectionVersion.toString()
                                << ". This indicates corrupted metadata.");
    }
}
/**
 * Reloads the metadata for 'ns' through MetadataLoader and installs it if it is the newest
 * of the three candidates: the metadata stored before the load ("before"), the metadata
 * stored once the load finished ("after"), and the freshly loaded metadata ("remote").
 *
 * The remote load runs without any lock held. _mutex is taken briefly to read the stored
 * metadata before the load, and again, under the database and collection locks, to compare
 * and install afterwards.
 *
 * @param txn operation context used to take the database and collection locks
 * @param ns namespace whose metadata is refreshed
 * @param reqShardVersion shard version requested by the caller
 * @param useRequestedVersion whether reqShardVersion takes part in the full-reload decision
 * @param latestShardVersion output, set to the most recent shard version known at each stage
 *
 * @return OK if the stored metadata was already current or the remote metadata was installed;
 *         NotYetInitialized if sharding is not enabled or no shard name is set;
 *         RemoteChangeDetected if no newest version could be chosen;
 *         otherwise the failing status of the load or of the pending chunk promotion
 */
Status ShardingState::doRefreshMetadata(OperationContext* txn,
                                        const string& ns,
                                        const ChunkVersion& reqShardVersion,
                                        bool useRequestedVersion,
                                        ChunkVersion* latestShardVersion) {
    // The idea here is that we're going to reload the metadata from the config server, but
    // we need to do so outside any locks.  When we get our result back, if the current metadata
    // has changed, we may not be able to install the new metadata.

    //
    // Get the initial metadata
    // No DBLock is needed since the metadata is expected to change during reload.
    //

    shared_ptr<CollectionMetadata> beforeMetadata;

    {
        stdx::lock_guard<stdx::mutex> lk(_mutex);

        // We can't reload if sharding is not enabled - i.e. without a config server location
        if (!_enabled) {
            string errMsg = str::stream() << "cannot refresh metadata for " << ns
                                          << " before sharding has been enabled";

            warning() << errMsg;
            return Status(ErrorCodes::NotYetInitialized, errMsg);
        }

        // We also can't reload if a shard name has not yet been set.
        if (_shardName.empty()) {
            string errMsg = str::stream() << "cannot refresh metadata for " << ns
                                          << " before shard name has been set";

            warning() << errMsg;
            return Status(ErrorCodes::NotYetInitialized, errMsg);
        }

        auto storedIt = _collMetadata.find(ns);
        if (storedIt != _collMetadata.end()) {
            beforeMetadata = storedIt->second;
        }
    }

    ChunkVersion beforeShardVersion;
    ChunkVersion beforeCollVersion;
    if (beforeMetadata) {
        beforeShardVersion = beforeMetadata->getShardVersion();
        beforeCollVersion = beforeMetadata->getCollVersion();
    }

    *latestShardVersion = beforeShardVersion;

    //
    // Determine whether we need to diff or fully reload
    //

    // A full reload is needed when there is nothing to diff against, or when the requested
    // epoch differs from the stored one (the stored metadata is then not a useful base)
    const bool fullReload = !beforeMetadata ||
        (useRequestedVersion && reqShardVersion.epoch() != beforeShardVersion.epoch());

    //
    // Load the metadata from the remote server, start construction
    //

    string requestedVersionDesc;
    if (useRequestedVersion) {
        requestedVersionDesc =
            string(" with requested shard version ") + reqShardVersion.toString();
    }

    const char* currentVersionDesc =
        fullReload ? ", current shard version is " : " based on current shard version ";

    LOG(0) << "remotely refreshing metadata for " << ns << requestedVersionDesc
           << currentVersionDesc << beforeShardVersion << ", current metadata version is "
           << beforeCollVersion;

    MetadataLoader mdLoader;
    CollectionMetadata* remoteMetadataRaw = new CollectionMetadata();
    shared_ptr<CollectionMetadata> remoteMetadata(remoteMetadataRaw);

    Timer loadTimer;
    Status status = mdLoader.makeCollectionMetadata(grid.catalogManager(),
                                                    ns,
                                                    getShardName(),
                                                    fullReload ? nullptr : beforeMetadata.get(),
                                                    remoteMetadataRaw);
    const long long loadMillis = loadTimer.millis();

    if (status.code() == ErrorCodes::NamespaceNotFound) {
        // A missing namespace is not an error here; it is carried on as "no remote metadata"
        remoteMetadata.reset();
        remoteMetadataRaw = nullptr;
    } else if (!status.isOK()) {
        warning() << "could not remotely refresh metadata for " << ns
                  << causedBy(status.reason());

        return status;
    }

    ChunkVersion remoteShardVersion;
    ChunkVersion remoteCollVersion;
    if (remoteMetadata) {
        remoteShardVersion = remoteMetadata->getShardVersion();
        remoteCollVersion = remoteMetadata->getCollVersion();
    }

    //
    // Get ready to install loaded metadata if needed
    //

    shared_ptr<CollectionMetadata> afterMetadata;
    ChunkVersion afterShardVersion;
    ChunkVersion afterCollVersion;

    ChunkVersion::VersionChoice choice;

    // If we choose to install the new metadata, this describes the kind of install
    enum InstallType {
        InstallType_New,
        InstallType_Update,
        InstallType_Replace,
        InstallType_Drop,
        InstallType_None
    } installType = InstallType_None;  // compiler complains otherwise

    {
        // Exclusive collection lock needed since we're now potentially changing the metadata,
        // and don't want reads/writes to be ongoing.
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_IX);
        Lock::CollectionLock collLock(txn->lockState(), ns, MODE_X);

        //
        // Get the metadata now that the load has completed
        //

        stdx::lock_guard<stdx::mutex> lk(_mutex);

        // Don't reload if our config server has changed or sharding is no longer enabled
        if (!_enabled) {
            string errMsg = str::stream() << "could not refresh metadata for " << ns
                                          << ", sharding is no longer enabled";

            warning() << errMsg;
            return Status(ErrorCodes::NotYetInitialized, errMsg);
        }

        auto it = _collMetadata.find(ns);
        if (it != _collMetadata.end()) {
            afterMetadata = it->second;
        }

        if (afterMetadata) {
            afterShardVersion = afterMetadata->getShardVersion();
            afterCollVersion = afterMetadata->getCollVersion();
        }

        *latestShardVersion = afterShardVersion;

        //
        // Resolve newer pending chunks with the remote metadata, finish construction
        //

        status = mdLoader.promotePendingChunks(afterMetadata.get(), remoteMetadataRaw);

        if (!status.isOK()) {
            warning() << "remote metadata for " << ns
                      << " is inconsistent with current pending chunks"
                      << causedBy(status.reason());

            return status;
        }

        //
        // Compare the 'before', 'after', and 'remote' versions/epochs and choose newest
        // Zero-epochs (sentinel value for "dropped" collections), are tested by
        // !epoch.isSet().
        //

        choice = ChunkVersion::chooseNewestVersion(
            beforeCollVersion, afterCollVersion, remoteCollVersion);

        if (choice == ChunkVersion::VersionChoice_Remote) {
            const bool remoteEpochSet = remoteCollVersion.epoch().isSet();

            dassert(!remoteEpochSet || remoteShardVersion >= beforeShardVersion);

            if (!afterCollVersion.epoch().isSet()) {
                // First metadata load
                installType = InstallType_New;
                dassert(it == _collMetadata.end());
                _collMetadata.insert(make_pair(ns, remoteMetadata));
            } else if (!remoteEpochSet) {
                // Drop detected
                dassert(!remoteEpochSet);
                installType = InstallType_Drop;
                _collMetadata.erase(it);
            } else if (remoteCollVersion.epoch() == afterCollVersion.epoch()) {
                // Update to existing metadata
                installType = InstallType_Update;

                // Invariant: If CollMetadata was not found, version should be have been 0.
                dassert(it != _collMetadata.end());
                it->second = remoteMetadata;
            } else {
                // New epoch detected, replacing metadata
                installType = InstallType_Replace;

                // Invariant: If CollMetadata was not found, version should be have been 0.
                dassert(it != _collMetadata.end());
                it->second = remoteMetadata;
            }

            *latestShardVersion = remoteShardVersion;
        }
    }
    // End _mutex
    // End DBWrite

    //
    // Do messaging based on what happened above
    //

    // Show a single stored version if the epoch did not change during the load, else both
    string storedVersionsDesc;
    if (beforeShardVersion.epoch() == afterShardVersion.epoch()) {
        storedVersionsDesc = afterShardVersion.toString();
    } else {
        storedVersionsDesc = beforeShardVersion.toString() + " / " + afterShardVersion.toString();
    }

    if (choice == ChunkVersion::VersionChoice_Unknown) {
        string errMsg = str::stream()
            << "need to retry loading metadata for " << ns
            << ", collection may have been dropped or recreated during load"
            << " (loaded shard version : " << remoteShardVersion.toString()
            << ", stored shard versions : " << storedVersionsDesc << ", took " << loadMillis
            << "ms)";

        warning() << errMsg;
        return Status(ErrorCodes::RemoteChangeDetected, errMsg);
    }

    if (choice == ChunkVersion::VersionChoice_Local) {
        LOG(0) << "metadata of collection " << ns
               << " already up to date (shard version : " << afterShardVersion.toString()
               << ", took " << loadMillis << "ms)";
        return Status::OK();
    }

    dassert(choice == ChunkVersion::VersionChoice_Remote);

    switch (installType) {
        case InstallType_New:
            LOG(0) << "collection " << ns << " was previously unsharded"
                   << ", new metadata loaded with shard version " << remoteShardVersion;
            break;
        case InstallType_Update:
            LOG(0) << "updating metadata for " << ns << " from shard version "
                   << storedVersionsDesc << " to shard version " << remoteShardVersion;
            break;
        case InstallType_Replace:
            LOG(0) << "replacing metadata for " << ns << " at shard version "
                   << storedVersionsDesc << " with a new epoch (shard version "
                   << remoteShardVersion << ")";
            break;
        case InstallType_Drop:
            LOG(0) << "dropping metadata for " << ns << " at shard version "
                   << storedVersionsDesc << ", took " << loadMillis << "ms";
            break;
        default:
            verify(false);
            break;
    }

    // The drop message above already carries the timing
    if (installType != InstallType_Drop) {
        LOG(0) << "collection version was loaded at version " << remoteCollVersion << ", took "
               << loadMillis << "ms";
    }

    return Status::OK();
}
void WriteBackListener::run() { int secsToSleep = 0; scoped_ptr<ChunkVersion> lastNeededVersion; int lastNeededCount = 0; bool needsToReloadShardInfo = false; while ( ! inShutdown() ) { if ( ! Shard::isAShardNode( _addr ) ) { LOG(1) << _addr << " is not a shard node" << endl; sleepsecs( 60 ); continue; } try { if (needsToReloadShardInfo) { // It's possible this shard was removed Shard::reloadShardInfo(); needsToReloadShardInfo = false; } scoped_ptr<ScopedDbConnection> conn( ScopedDbConnection::getInternalScopedDbConnection( _addr ) ); BSONObj result; { BSONObjBuilder cmd; cmd.appendOID( "writebacklisten" , &serverID ); // Command will block for data if ( ! conn->get()->runCommand( "admin" , cmd.obj() , result ) ) { result = result.getOwned(); log() << "writebacklisten command failed! " << result << endl; conn->done(); continue; } } conn->done(); LOG(1) << "writebacklisten result: " << result << endl; BSONObj data = result.getObjectField( "data" ); if ( data.getBoolField( "writeBack" ) ) { string ns = data["ns"].valuestrsafe(); ConnectionIdent cid( "" , 0 ); OID wid; if ( data["connectionId"].isNumber() && data["id"].type() == jstOID ) { string s = ""; if ( data["instanceIdent"].type() == String ) s = data["instanceIdent"].String(); cid = ConnectionIdent( s , data["connectionId"].numberLong() ); wid = data["id"].OID(); } else { warning() << "mongos/mongod version mismatch (1.7.5 is the split)" << endl; } int len; // not used, but needed for next call Message msg( (void*)data["msg"].binData( len ) , false ); massert( 10427 , "invalid writeback message" , msg.header()->valid() ); DBConfigPtr db = grid.getDBConfig( ns ); ChunkVersion needVersion = ChunkVersion::fromBSON( data, "version" ); // // TODO: Refactor the sharded strategy to correctly handle all sharding state changes itself, // we can't rely on WBL to do this for us b/c anything could reset our state in-between. 
// We should always reload here for efficiency when possible, but staleness is also caught in the // loop below. // ChunkManagerPtr manager; ShardPtr primary; db->getChunkManagerOrPrimary( ns, manager, primary ); ChunkVersion currVersion; if( manager ) currVersion = manager->getVersion(); LOG(1) << "connectionId: " << cid << " writebackId: " << wid << " needVersion : " << needVersion.toString() << " mine : " << currVersion.toString() << endl; LOG(1) << msg.toString() << endl; // // We should reload only if we need to update our version to be compatible *and* we // haven't already done so. This avoids lots of reloading when we remove/add a sharded collection // bool alreadyReloaded = lastNeededVersion && lastNeededVersion->isEquivalentTo( needVersion ); if( alreadyReloaded ){ LOG(1) << "wbl already reloaded config information for version " << needVersion << ", at version " << currVersion << endl; } else if( lastNeededVersion ) { log() << "new version change detected to " << needVersion.toString() << ", " << lastNeededCount << " writebacks processed at " << lastNeededVersion->toString() << endl; lastNeededCount = 0; } // // Set our lastNeededVersion for next time // lastNeededVersion.reset( new ChunkVersion( needVersion ) ); lastNeededCount++; // // Determine if we should reload, if so, reload // bool shouldReload = ! needVersion.isWriteCompatibleWith( currVersion ) && ! 
alreadyReloaded; if( shouldReload && currVersion.isSet() && needVersion.isSet() && currVersion.hasCompatibleEpoch( needVersion ) ) { // // If we disagree about versions only, reload the chunk manager // db->getChunkManagerIfExists( ns, true ); } else if( shouldReload ){ // // If we disagree about anything else, reload the full db // warning() << "reloading config data for " << db->getName() << ", " << "wanted version " << needVersion.toString() << " but currently have version " << currVersion.toString() << endl; db->reload(); } // do request and then call getLastError // we have to call getLastError so we can return the right fields to the user if they decide to call getLastError BSONObj gle; int attempts = 0; while ( true ) { attempts++; try { Request r( msg , 0 ); r.init(); r.d().reservedField() |= Reserved_FromWriteback; ClientInfo * ci = r.getClientInfo(); if (!noauth) { ci->getAuthorizationManager()->grantInternalAuthorization( "_writebackListener"); } ci->noAutoSplit(); r.process( attempts ); ci->newRequest(); // this so we flip prev and cur shards BSONObjBuilder b; string errmsg; if ( ! ci->getLastError( "admin", BSON( "getLastError" << 1 ), b, errmsg, true ) ) { b.appendBool( "commandFailed" , true ); if( ! 
b.hasField( "errmsg" ) ){ b.append( "errmsg", errmsg ); gle = b.obj(); } else if( errmsg.size() > 0 ){ // Rebuild GLE object with errmsg // TODO: Make this less clumsy by improving GLE interface gle = b.obj(); if( gle["errmsg"].type() == String ){ BSONObj gleNoErrmsg = gle.filterFieldsUndotted( BSON( "errmsg" << 1 ), false ); BSONObjBuilder bb; bb.appendElements( gleNoErrmsg ); bb.append( "errmsg", gle["errmsg"].String() + " ::and:: " + errmsg ); gle = bb.obj().getOwned(); } } } else{ gle = b.obj(); } if ( gle["code"].numberInt() == 9517 ) { log() << "new version change detected, " << lastNeededCount << " writebacks processed previously" << endl; lastNeededVersion.reset(); lastNeededCount = 1; log() << "writeback failed because of stale config, retrying attempts: " << attempts << endl; LOG(1) << "writeback error : " << gle << endl; // // Bringing this in line with the similar retry logic elsewhere // // TODO: Reloading the chunk manager may not help if we dropped a // collection, but we don't actually have that info in the writeback // error // if( attempts <= 2 ){ db->getChunkManagerIfExists( ns, true ); } else{ versionManager.forceRemoteCheckShardVersionCB( ns ); sleepsecs( attempts - 1 ); } uassert( 15884, str::stream() << "Could not reload chunk manager after " << attempts << " attempts.", attempts <= 4 ); continue; } ci->clearSinceLastGetError(); } catch ( DBException& e ) { error() << "error processing writeback: " << e << endl; BSONObjBuilder b; e.getInfo().append( b, "err", "code" ); gle = b.obj(); } break; } { scoped_lock lk( _seenWritebacksLock ); WBStatus& s = _seenWritebacks[cid]; s.id = wid; s.gle = gle; } } else if ( result["noop"].trueValue() ) { // no-op } else { log() << "unknown writeBack result: " << result << endl; } secsToSleep = 0; continue; } catch ( std::exception& e ) { // Attention! Do not call any method that would throw an exception // (or assert) in this block. 
if ( inShutdown() ) { // we're shutting down, so just clean up return; } log() << "WriteBackListener exception : " << e.what() << endl; needsToReloadShardInfo = true; } catch ( ... ) { log() << "WriteBackListener uncaught exception!" << endl; } secsToSleep++; sleepsecs(secsToSleep); if ( secsToSleep > 10 ) secsToSleep = 0; } log() << "WriteBackListener exiting : address no longer in cluster " << _addr; }
/**
 * Handles a setShardVersion request for one connection.
 *
 * Fields read from cmdObj: "serverID", "authoritative", "configdb", "shard", "version",
 * "init" and "setShardVersion" (the namespace).
 *
 * Three versions are compared: the requested one ("version"), the one previously recorded
 * for this connection (oldVersion) and the one held by shardingState for the namespace
 * (globalVersion). If the fast checks cannot settle the request, the metadata is refreshed
 * through shardingState.refreshMetadataIfNeeded and the requested version is compared with
 * the refreshed one.
 *
 * On failure 'errmsg' is set (except for the "not master" case, which writes "errmsg"
 * directly into 'result') and 'result' may carry "ns", "need_authoritative",
 * "reloadConfig", "version", "globalVersion", "newVersion" and "origVersion".
 *
 * @param cmdObj the command document
 * @param errmsg output, the failure description
 * @param result output, the reply being built
 * @return true if the command succeeded, false otherwise
 */
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {

    // Steps
    // 1. check basic config
    // 2. extract params from command
    // 3. fast check
    // 4. slow check (LOCKS)

    // step 1

    lastError.disableForCommand();
    ShardedConnectionInfo* info = ShardedConnectionInfo::get( true );

    // make sure we have the mongos id for writebacks
    if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) )
        return false;

    bool authoritative = cmdObj.getBoolField( "authoritative" );

    // check config server is ok or enable sharding
    if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) )
        return false;

    // check shard name/hosts are correct
    if ( cmdObj["shard"].type() == String ) {
        shardingState.gotShardName( cmdObj["shard"].String() );
    }

    // Handle initial shard connection
    if( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ){
        result.append( "initialized", true );

        // Send back wire version to let mongos know what protocol we can speak
        result.append( "minWireVersion", minWireVersion );
        result.append( "maxWireVersion", maxWireVersion );

        return true;
    }

    // we can run on a slave up to here
    if ( ! isMaster( "admin" ) ) {
        // Note that this failure is reported through 'result' rather than through 'errmsg'
        result.append( "errmsg" , "not master" );
        result.append( "note" , "from post init in setShardVersion" );
        return false;
    }

    // step 2

    string ns = cmdObj["setShardVersion"].valuestrsafe();
    if ( ns.size() == 0 ) {
        errmsg = "need to specify namespace";
        return false;
    }

    if( ! ChunkVersion::canParseBSON( cmdObj, "version" ) ){
        errmsg = "need to specify version";
        return false;
    }

    const ChunkVersion version = ChunkVersion::fromBSON( cmdObj, "version" );

    // step 3

    // oldVersion is what this connection last recorded, globalVersion is what the shard holds
    const ChunkVersion oldVersion = info->getVersion(ns);
    const ChunkVersion globalVersion = shardingState.getVersion(ns);

    // The previous connection version is reported back on every path from here on
    oldVersion.addToBSON( result, "oldVersion" );

    if ( globalVersion.isSet() && version.isSet() ) {
        // this means there is no reset going on on either side
        // so it's safe to make some assumptions

        if ( version.isWriteCompatibleWith( globalVersion ) ) {
            // mongos and mongod agree!
            if ( ! oldVersion.isWriteCompatibleWith( version ) ) {
                if ( oldVersion < globalVersion &&
                     oldVersion.hasCompatibleEpoch(globalVersion) )
                {
                    info->setVersion( ns , version );
                }
                else if ( authoritative ) {
                    // this means there was a drop and our version is reset
                    info->setVersion( ns , version );
                }
                else {
                    result.append( "ns" , ns );
                    result.appendBool( "need_authoritative" , true );
                    errmsg = "verifying drop on '" + ns + "'";
                    return false;
                }
            }
            return true;
        }

    }

    // step 4

    // this is because of a weird segfault I saw and I can't see why this should ever be set
    massert( 13647 , str::stream() << "context should be empty here, is: " << cc().getContext()->ns() , cc().getContext() == 0 );

    if ( oldVersion.isSet() && ! globalVersion.isSet() ) {
        // this had been reset
        info->setVersion( ns , ChunkVersion( 0, OID() ) );
    }

    if ( ! version.isSet() && ! globalVersion.isSet() ) {
        // this connection is cleaning itself
        info->setVersion( ns , ChunkVersion( 0, OID() ) );
        return true;
    }

    // Cases below all either return OR fall-through to remote metadata reload.
    if ( version.isSet() || !globalVersion.isSet() ) {

        // Not Dropping

        // TODO: Refactor all of this
        if ( version < oldVersion && version.hasCompatibleEpoch( oldVersion ) ) {
            errmsg = "this connection already had a newer version of collection '" + ns + "'";
            result.append( "ns" , ns );
            version.addToBSON( result, "newVersion" );
            globalVersion.addToBSON( result, "globalVersion" );
            return false;
        }

        // TODO: Refactor all of this
        if ( version < globalVersion && version.hasCompatibleEpoch( globalVersion ) ) {
            // Block until the migration critical section is over before answering
            while ( shardingState.inCriticalMigrateSection() ) {
                log() << "waiting till out of critical section" << endl;
                shardingState.waitTillNotInCriticalSection( 10 );
            }
            errmsg = "shard global version for collection is higher than trying to set to '" + ns + "'";
            result.append( "ns" , ns );
            version.addToBSON( result, "version" );
            globalVersion.addToBSON( result, "globalVersion" );
            result.appendBool( "reloadConfig" , true );
            return false;
        }

        if ( ! globalVersion.isSet() && ! authoritative ) {
            // Needed b/c when the last chunk is moved off a shard, the version gets reset to zero, which
            // should require a reload.
            while ( shardingState.inCriticalMigrateSection() ) {
                log() << "waiting till out of critical section" << endl;
                shardingState.waitTillNotInCriticalSection( 10 );
            }

            // need authoritative for first look
            result.append( "ns" , ns );
            result.appendBool( "need_authoritative" , true );
            errmsg = "first time for collection '" + ns + "'";
            return false;
        }

        // Fall through to metadata reload below
    }
    else {

        // Dropping

        if ( ! authoritative ) {
            result.appendBool( "need_authoritative" , true );
            result.append( "ns" , ns );
            globalVersion.addToBSON( result, "globalVersion" );
            errmsg = "dropping needs to be authoritative";
            return false;
        }

        // Fall through to metadata reload below
    }

    // Refresh the metadata; currVersion receives the shard version reported by the refresh
    ChunkVersion currVersion;
    Status status = shardingState.refreshMetadataIfNeeded( ns, version, &currVersion );

    if (!status.isOK()) {

        // The reload itself was interrupted or confused here

        errmsg = str::stream() << "could not refresh metadata for " << ns
                               << " with requested shard version " << version.toString()
                               << ", stored shard version is " << currVersion.toString()
                               << causedBy( status.reason() );

        warning() << errmsg << endl;

        result.append( "ns" , ns );
        version.addToBSON( result, "version" );
        currVersion.addToBSON( result, "globalVersion" );
        result.appendBool( "reloadConfig", true );

        return false;
    }
    else if ( !version.isWriteCompatibleWith( currVersion ) ) {

        // We reloaded a version that doesn't match the version mongos was trying to
        // set.

        errmsg = str::stream() << "requested shard version differs from"
                               << " config shard version for " << ns
                               << ", requested version is " << version.toString()
                               << " but found version " << currVersion.toString();

        OCCASIONALLY warning() << errmsg << endl;

        // WARNING: the exact fields below are important for compatibility with mongos
        // version reload.

        result.append( "ns" , ns );
        currVersion.addToBSON( result, "globalVersion" );

        // If this was a reset of a collection or the last chunk moved out, inform mongos to
        // do a full reload.
        if (currVersion.epoch() != version.epoch() || !currVersion.isSet() ) {
            result.appendBool( "reloadConfig", true );
            // Zero-version also needed to trigger full mongos reload, sadly
            // TODO: Make this saner, and less impactful (full reload on last chunk is bad)
            ChunkVersion( 0, 0, OID() ).addToBSON( result, "version" );
            // For debugging
            version.addToBSON( result, "origVersion" );
        }
        else {
            version.addToBSON( result, "version" );
        }

        return false;
    }

    // The refreshed version is write-compatible with the requested one: record it
    info->setVersion( ns , version );
    return true;
}
/**
 * Reloads the collection metadata for 'ns' through a MetadataLoader built from _configServer
 * and, if the loaded ("remote") metadata is chosen as the newest, installs it in _collMetadata.
 *
 * The remote load itself runs without _mutex or a DB lock held.  Locks are only taken to
 * snapshot the cached metadata before the load ("before") and to re-read it and install the
 * result after the load ("after").  Because the cached metadata may change in between, the
 * 'before', 'after' and 'remote' collection versions are compared via
 * ChunkVersion::chooseNewestVersion() to decide what to do.
 *
 * @param ns                  namespace whose metadata is refreshed
 * @param reqShardVersion     only consulted when 'useRequestedVersion' is true: it is logged,
 *                            and an epoch different from the cached shard version's epoch
 *                            forces a full reload
 * @param useRequestedVersion whether 'reqShardVersion' should be taken into account
 * @param latestShardVersion  out-parameter, dereferenced unconditionally (must not be NULL).
 *                            Set to the cached shard version seen before the load, then to the
 *                            one seen after the load, and finally to the remote shard version
 *                            if the remote metadata is installed.
 *
 * @return Status::OK() if the remote metadata was installed or nothing newer was found;
 *         IllegalOperation if no shard name has been set;
 *         the loader's status if makeCollectionMetadata() fails with anything other than
 *         NamespaceNotFound, or if promotePendingChunks() fails;
 *         RemoteChangeDetected if the version comparison yields VersionChoice_Unknown.
 */
Status ShardingState::doRefreshMetadata( const string& ns,
                                         const ChunkVersion& reqShardVersion,
                                         bool useRequestedVersion,
                                         ChunkVersion* latestShardVersion ) {

    // The idea here is that we're going to reload the metadata from the config server, but
    // we need to do so outside any locks.  When we get our result back, if the current metadata
    // has changed, we may not be able to install the new metadata.

    //
    // Get the initial metadata
    // No DBLock is needed since the metadata is expected to change during reload.
    //

    CollectionMetadataPtr beforeMetadata;
    string shardName;

    {
        // Snapshot the cached metadata pointer and the shard name together under _mutex
        scoped_lock lk( _mutex );

        CollectionMetadataMap::iterator it = _collMetadata.find( ns );
        if ( it != _collMetadata.end() ) beforeMetadata = it->second;

        shardName = _shardName;
    }

    // Both versions stay default-constructed when nothing is cached for this namespace
    ChunkVersion beforeShardVersion;
    ChunkVersion beforeCollVersion;
    if ( beforeMetadata ) {
        beforeShardVersion = beforeMetadata->getShardVersion();
        beforeCollVersion = beforeMetadata->getCollVersion();
    }

    // Report what we currently have, in case we return early below
    *latestShardVersion = beforeShardVersion;

    // We can't reload without a shard name.  Must check here before loading, since shard name
    // may have changed if we checked it earlier and released the _mutex.
    if ( shardName.empty() ) {

        string errMsg = str::stream() << "cannot refresh metadata for " << ns
                                      << " before shard name has been set";

        LOG( 0 ) << errMsg << endl;
        return Status( ErrorCodes::IllegalOperation, errMsg );
    }

    //
    // Determine whether we need to diff or fully reload
    //

    bool fullReload = false;
    if ( !beforeMetadata ) {
        // We don't have any metadata to reload from
        fullReload = true;
    }
    else if ( useRequestedVersion && reqShardVersion.epoch() != beforeShardVersion.epoch() ) {
        // It's not useful to use the metadata as a base because we think the epoch will differ
        fullReload = true;
    }

    //
    // Load the metadata from the remote server, start construction
    //

    LOG( 0 ) << "remotely refreshing metadata for " << ns
             << ( useRequestedVersion ?
                  string( " with requested shard version " ) + reqShardVersion.toString() : "" )
             << ( fullReload ?
                  ", current shard version is " : " based on current shard version " )
             << beforeShardVersion
             << ", current metadata version is " << beforeCollVersion << endl;

    // NOTE(review): this errMsg is handed to ConnectionString::parse() but never read
    // afterwards in this function, and _configServer is read here without _mutex held --
    // confirm that both are intentional.
    string errMsg;
    ConnectionString configServerLoc = ConnectionString::parse( _configServer, errMsg );
    MetadataLoader mdLoader( configServerLoc );

    // remoteMetadata owns the object; the raw pointer is kept only to pass to the loader
    CollectionMetadata* remoteMetadataRaw = new CollectionMetadata();
    CollectionMetadataPtr remoteMetadata( remoteMetadataRaw );

    // A full reload passes no base metadata; otherwise the cached metadata is passed along
    Timer refreshTimer;
    Status status =
        mdLoader.makeCollectionMetadata( ns,
                                         shardName,
                                         ( fullReload ? NULL : beforeMetadata.get() ),
                                         remoteMetadataRaw );
    long long refreshMillis = refreshTimer.millis();

    if ( status.code() == ErrorCodes::NamespaceNotFound ) {
        // Not treated as a failure: continue with no remote metadata, so the remote versions
        // below stay default-constructed
        remoteMetadata.reset();
        remoteMetadataRaw = NULL;
    }
    else if ( !status.isOK() ) {

        warning() << "could not remotely refresh metadata for " << ns
                  << causedBy( status.reason() ) << endl;

        return status;
    }

    ChunkVersion remoteShardVersion;
    ChunkVersion remoteCollVersion;
    if ( remoteMetadata ) {
        remoteShardVersion = remoteMetadata->getShardVersion();
        remoteCollVersion = remoteMetadata->getCollVersion();
    }

    //
    // Get ready to install loaded metadata if needed
    //

    CollectionMetadataPtr afterMetadata;
    ChunkVersion afterShardVersion;
    ChunkVersion afterCollVersion;
    ChunkVersion::VersionChoice choice;

    // If we choose to install the new metadata, this describes the kind of install
    enum InstallType {
        InstallType_New,
        InstallType_Update,
        InstallType_Replace,
        InstallType_Drop,
        InstallType_None
    } installType = InstallType_None; // compiler complains otherwise

    {
        // DBLock needed since we're now potentially changing the metadata, and don't want
        // reads/writes to be ongoing.
        // The DB write lock is acquired before _mutex; both are held until this scope ends.
        Lock::DBWrite writeLk( ns );

        //
        // Get the metadata now that the load has completed
        //

        scoped_lock lk( _mutex );

        // 'it' is reused below to update or erase the map entry while _mutex is still held
        CollectionMetadataMap::iterator it = _collMetadata.find( ns );
        if ( it != _collMetadata.end() ) afterMetadata = it->second;

        if ( afterMetadata ) {
            afterShardVersion = afterMetadata->getShardVersion();
            afterCollVersion = afterMetadata->getCollVersion();
        }

        // Report the version cached right now; overwritten below only if we install the
        // remote metadata
        *latestShardVersion = afterShardVersion;

        //
        // Resolve newer pending chunks with the remote metadata, finish construction
        //

        status = mdLoader.promotePendingChunks( afterMetadata.get(), remoteMetadataRaw );

        if ( !status.isOK() ) {

            warning() << "remote metadata for " << ns
                      << " is inconsistent with current pending chunks"
                      << causedBy( status.reason() ) << endl;

            return status;
        }

        //
        // Compare the 'before', 'after', and 'remote' versions/epochs and choose newest
        // Zero-epochs (sentinel value for "dropped" collections), are tested by
        // !epoch.isSet().
        //

        choice = ChunkVersion::chooseNewestVersion( beforeCollVersion,
                                                    afterCollVersion,
                                                    remoteCollVersion );

        if ( choice == ChunkVersion::VersionChoice_Remote ) {

            // Sanity check: unless the remote epoch is unset, the remote shard version
            // must not be older than the one we started from
            dassert(!remoteCollVersion.epoch().isSet() ||
                    remoteShardVersion >= beforeShardVersion);

            if ( !afterCollVersion.epoch().isSet() ) {

                // First metadata load
                installType = InstallType_New;
                dassert( it == _collMetadata.end() );
                _collMetadata.insert( make_pair( ns, remoteMetadata ) );
            }
            else if ( remoteCollVersion.epoch().isSet() &&
                      remoteCollVersion.epoch() == afterCollVersion.epoch() ) {

                // Update to existing metadata
                installType = InstallType_Update;

                // Invariant: If CollMetadata was not found, version should have been 0.
                dassert( it != _collMetadata.end() );
                it->second = remoteMetadata;
            }
            else if ( remoteCollVersion.epoch().isSet() ) {

                // New epoch detected, replacing metadata
                installType = InstallType_Replace;

                // Invariant: If CollMetadata was not found, version should have been 0.
                dassert( it != _collMetadata.end() );
                it->second = remoteMetadata;
            }
            else {
                dassert( !remoteCollVersion.epoch().isSet() );

                // Drop detected
                installType = InstallType_Drop;
                _collMetadata.erase( it );
            }

            // The remote metadata is now what is installed (or the entry was dropped)
            *latestShardVersion = remoteShardVersion;
        }
    }
    // End _mutex
    // End DBWrite

    //
    // Do messaging based on what happened above
    //

    // Only one stored version is printed when the 'before' and 'after' epochs are equal;
    // otherwise both the 'before' and 'after' versions are printed
    string versionMsg = str::stream()
        << " (loaded metadata version : " << remoteCollVersion.toString()
        << ( beforeCollVersion.epoch() == afterCollVersion.epoch() ?
             string( ", stored version : " ) + afterCollVersion.toString() :
             string( ", stored versions : " ) + beforeCollVersion.toString() +
                 " / " + afterCollVersion.toString() )
        << ", took " << refreshMillis << "ms)";

    if ( choice == ChunkVersion::VersionChoice_Unknown ) {

        // Nothing was installed above; the error asks the caller to retry the load
        string errMsg = str::stream()
            << "need to retry loading metadata for " << ns
            << ", collection may have been dropped or recreated during load"
            << versionMsg;

        warning() << errMsg << endl;
        return Status( ErrorCodes::RemoteChangeDetected, errMsg );
    }

    if ( choice == ChunkVersion::VersionChoice_Local ) {

        // The cached metadata was chosen, so nothing was installed
        LOG( 0 ) << "newer metadata not found for " << ns << versionMsg << endl;
        return Status::OK();
    }

    // Only the "remote metadata was installed" outcome is left; log which kind of install
    dassert( choice == ChunkVersion::VersionChoice_Remote );

    switch( installType ) {
    case InstallType_New:
        LOG( 0 ) << "loaded new metadata for " << ns << versionMsg << endl;
        break;
    case InstallType_Update:
        LOG( 0 ) << "loaded newer metadata for " << ns << versionMsg << endl;
        break;
    case InstallType_Replace:
        LOG( 0 ) << "replacing metadata for " << ns << versionMsg << endl;
        break;
    case InstallType_Drop:
        LOG( 0 ) << "dropping metadata for " << ns << versionMsg << endl;
        break;
    default:
        // InstallType_None is not expected once the remote metadata has been chosen
        verify( false );
        break;
    }

    return Status::OK();
}
bool WriteBatchExecutor::doWrite( const string& ns, const BatchItemRef& itemRef, CurOp* currentOp, WriteStats* stats, BSONObj* upsertedID, BatchedErrorDetail* error ) { const BatchedCommandRequest& request = *itemRef.getRequest(); int index = itemRef.getItemIndex(); // // Check our shard version if we need to (must be in the write lock) // if ( shardingState.enabled() && request.isShardVersionSet() && !ChunkVersion::isIgnoredVersion( request.getShardVersion() ) ) { Lock::assertWriteLocked( ns ); CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( ns ); ChunkVersion shardVersion = metadata ? metadata->getShardVersion() : ChunkVersion::UNSHARDED(); if ( !request.getShardVersion() // .isWriteCompatibleWith( shardVersion ) ) { // Write stale error to results error->setErrCode( ErrorCodes::StaleShardVersion ); BSONObjBuilder infoB; shardVersion.addToBSON( infoB, "vWanted" ); error->setErrInfo( infoB.obj() ); string errMsg = mongoutils::str::stream() << "stale shard version detected before write, received " << request.getShardVersion().toString() << " but local version is " << shardVersion.toString(); error->setErrMessage( errMsg ); return false; } } // // Not stale, do the actual write // if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert ) { // Insert return doInsert( ns, request.getInsertRequest()->getDocumentsAt( index ), currentOp, stats, error ); } else if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) { // Update return doUpdate( ns, *request.getUpdateRequest()->getUpdatesAt( index ), currentOp, stats, upsertedID, error ); } else { dassert( request.getBatchType() == BatchedCommandRequest::BatchType_Delete ); // Delete return doDelete( ns, *request.getDeleteRequest()->getDeletesAt( index ), currentOp, stats, error ); } }
/**
 * Sends 'updateOps' to the config shard as one applyOps command guarded by 'preCondition'
 * and carrying 'writeConcern'.
 *
 * If the command could not be run at all, that status is returned as-is.  If it ran but the
 * command or its write concern failed (or the 'failApplyChunkOps' fail point is active), the
 * outcome is double-checked: the chunks collection is queried for a chunk of 'nss' whose
 * lastmod equals 'lastChunkVersion'.  Finding it is taken as proof that the commit did go
 * through and OK is returned; otherwise the failure is returned with added context.
 *
 * Callers that are not the config server must use majority read and write concern.
 */
Status ShardingCatalogClientImpl::applyChunkOpsDeprecated(OperationContext* opCtx,
                                                          const BSONArray& updateOps,
                                                          const BSONArray& preCondition,
                                                          const NamespaceString& nss,
                                                          const ChunkVersion& lastChunkVersion,
                                                          const WriteConcernOptions& writeConcern,
                                                          repl::ReadConcernLevel readConcern) {
    invariant(serverGlobalParams.clusterRole == ClusterRole::ConfigServer ||
              (readConcern == repl::ReadConcernLevel::kMajorityReadConcern &&
               writeConcern.wMode == WriteConcernOptions::kMajority));

    BSONObj applyOpsCmd =
        BSON("applyOps" << updateOps << "preCondition" << preCondition
                        << WriteConcernOptions::kWriteConcernField
                        << writeConcern.toBSON());

    const auto configShard = Grid::get(opCtx)->shardRegistry()->getConfigShard();
    auto swResponse =
        configShard->runCommandWithFixedRetryAttempts(opCtx,
                                                      ReadPreferenceSetting{ReadPreference::PrimaryOnly},
                                                      "config",
                                                      applyOpsCmd,
                                                      Shard::RetryPolicy::kIdempotent);
    if (!swResponse.isOK()) {
        return swResponse.getStatus();
    }

    // A command failure takes precedence over a write concern failure.
    auto& cmdResponse = swResponse.getValue();
    Status status = cmdResponse.commandStatus.isOK()
        ? std::move(cmdResponse.writeConcernStatus)
        : std::move(cmdResponse.commandStatus);

    // TODO (Dianna) This fail point needs to be reexamined when CommitChunkMigration is in:
    // migrations will no longer be able to exercise it, so split or merge will need to do so.
    // SERVER-22659.
    if (MONGO_FAIL_POINT(failApplyChunkOps)) {
        status = Status(ErrorCodes::InternalError, "Failpoint 'failApplyChunkOps' generated error");
    }

    if (status.isOK()) {
        return Status::OK();
    }

    // The failure may only be a blip in network connectivity, so check whether the commit
    // request made it anyway.
    //
    // If all the updates were written to the chunks collection, the last document in the list
    // of updates can be read back from it; that chunk is identified by namespace and version.
    warning() << "chunk operation commit failed and metadata will be revalidated"
              << causedBy(redact(status));

    // Look for the chunk whose version got bumped.  If that modification reached the config
    // server, the whole operation is assumed to have succeeded.
    BSONObjBuilder query;
    lastChunkVersion.appendLegacyWithField(&query, ChunkType::lastmod());
    query.append(ChunkType::ns(), nss.ns());

    auto chunkWithStatus = getChunks(opCtx, query.obj(), BSONObj(), 1, nullptr, readConcern);

    if (!chunkWithStatus.isOK()) {
        // Could not even run the verification query
        const string verifyFailedMsg = str::stream()
            << "getChunks function failed, unable to validate chunk "
            << "operation metadata: " << chunkWithStatus.getStatus().toString()
            << ". applyChunkOpsDeprecated failed to get confirmation "
            << "of commit. Unable to save chunk ops. Command: " << applyOpsCmd
            << ". Result: " << cmdResponse.response;
        return status.withContext(verifyFailedMsg);
    }

    const auto& newestChunk = chunkWithStatus.getValue();

    if (newestChunk.empty()) {
        // The bumped version is not there, so the commit really did fail
        const string notCommittedMsg = str::stream()
            << "chunk operation commit failed: version " << lastChunkVersion.toString()
            << " doesn't exist in namespace: " << nss.ns()
            << ". Unable to save chunk ops. Command: " << applyOpsCmd
            << ". Result: " << cmdResponse.response;
        return status.withContext(notCommittedMsg);
    }

    // The query was limited to one document, and that document proves the commit went through
    invariant(newestChunk.size() == 1);
    return Status::OK();
}