Status enforceLegacyWriteConcern( MultiCommandDispatch* dispatcher,
                                  const StringData& dbName,
                                  const BSONObj& options,
                                  const HostOpTimeMap& hostOpTimes,
                                  vector<LegacyWCResponse>* legacyWCResponses ) {

    if ( hostOpTimes.empty() ) {
        return Status::OK();
    }

    for ( HostOpTimeMap::const_iterator it = hostOpTimes.begin(); it != hostOpTimes.end();
          ++it ) {

        const ConnectionString& shardEndpoint = it->first;
        const HostOpTime hot = it->second;
        const OpTime& opTime = hot.opTime;
        const OID& electionId = hot.electionId;

        LOG( 3 ) << "enforcing write concern " << options << " on " << shardEndpoint.toString()
                 << " at opTime " << opTime.toStringPretty() << " with electionID "
                 << electionId;

        BSONObj gleCmd = buildGLECmdWithOpTime( options, opTime, electionId );

        RawBSONSerializable gleCmdSerial( gleCmd );
        dispatcher->addCommand( shardEndpoint, dbName, gleCmdSerial );
    }

    dispatcher->sendAll();

    vector<Status> failedStatuses;

    while ( dispatcher->numPending() > 0 ) {

        ConnectionString shardEndpoint;
        RawBSONSerializable gleResponseSerial;

        Status dispatchStatus = dispatcher->recvAny( &shardEndpoint, &gleResponseSerial );
        if ( !dispatchStatus.isOK() ) {
            // We need to get all responses before returning
            failedStatuses.push_back( dispatchStatus );
            continue;
        }

        BSONObj gleResponse = BatchSafeWriter::stripNonWCInfo( gleResponseSerial.toBSON() );

        // Use the downconversion tools to determine if this GLE response is ok, a
        // write concern error, or an unknown error we should immediately abort for.
        BatchSafeWriter::GLEErrors errors;
        Status extractStatus = BatchSafeWriter::extractGLEErrors( gleResponse, &errors );
        if ( !extractStatus.isOK() ) {
            failedStatuses.push_back( extractStatus );
            continue;
        }

        LegacyWCResponse wcResponse;
        wcResponse.shardHost = shardEndpoint.toString();
        wcResponse.gleResponse = gleResponse;
        if ( errors.wcError.get() ) {
            wcResponse.errToReport = errors.wcError->getErrMessage();
        }

        legacyWCResponses->push_back( wcResponse );
    }

    if ( failedStatuses.empty() ) {
        return Status::OK();
    }

    StringBuilder builder;
    builder << "could not enforce write concern";

    for ( vector<Status>::const_iterator it = failedStatuses.begin();
          it != failedStatuses.end(); ++it ) {
        const Status& failedStatus = *it;
        if ( it == failedStatuses.begin() ) {
            builder << causedBy( failedStatus.toString() );
        }
        else {
            builder << ":: and ::" << failedStatus.toString();
        }
    }

    return Status( failedStatuses.size() == 1u ? failedStatuses.front().code() :
                                                 ErrorCodes::MultipleErrorsOccurred,
                   builder.str() );
}
void DBException::traceIfNeeded( const DBException& e ) {
    if( traceExceptions && ! inShutdown() ){
        warning() << "DBException thrown" << causedBy( e ) << endl;
        printStackTrace();
    }
}
void updateChunkWriteStatsAndSplitIfNeeded(OperationContext* opCtx, ChunkManager* manager, Chunk* chunk, long dataWritten) { // Disable lastError tracking so that any errors, which occur during auto-split do not get // bubbled up on the client connection doing a write LastError::Disabled disableLastError(&LastError::get(opCtx->getClient())); const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration(); const bool minIsInf = (0 == manager->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk->getMin())); const bool maxIsInf = (0 == manager->getShardKeyPattern().getKeyPattern().globalMax().woCompare(chunk->getMax())); const uint64_t chunkBytesWritten = chunk->addBytesWritten(dataWritten); const uint64_t desiredChunkSize = calculateDesiredChunkSize(balancerConfig->getMaxChunkSizeBytes(), manager->numChunks()); if (!chunk->shouldSplit(desiredChunkSize, minIsInf, maxIsInf)) { return; } const NamespaceString nss(manager->getns()); if (!manager->_autoSplitThrottle._splitTickets.tryAcquire()) { LOG(1) << "won't auto split because not enough tickets: " << nss; return; } TicketHolderReleaser releaser(&(manager->_autoSplitThrottle._splitTickets)); const ChunkRange chunkRange(chunk->getMin(), chunk->getMax()); try { // Ensure we have the most up-to-date balancer configuration uassertStatusOK(balancerConfig->refreshAndCheck(opCtx)); if (!balancerConfig->getShouldAutoSplit()) { return; } LOG(1) << "about to initiate autosplit: " << redact(chunk->toString()) << " dataWritten: " << chunkBytesWritten << " desiredChunkSize: " << desiredChunkSize; const uint64_t chunkSizeToUse = [&]() { const uint64_t estNumSplitPoints = chunkBytesWritten / desiredChunkSize * 2; if (estNumSplitPoints >= kTooManySplitPoints) { // The current desired chunk size will split the chunk into lots of small chunk and // at the worst case this can result into thousands of chunks. So check and see if a // bigger value can be used. return std::min(chunkBytesWritten, balancerConfig->getMaxChunkSizeBytes()); } else { return desiredChunkSize; } }(); auto splitPoints = uassertStatusOK(shardutil::selectChunkSplitPoints(opCtx, chunk->getShardId(), nss, manager->getShardKeyPattern(), chunkRange, chunkSizeToUse, boost::none)); if (splitPoints.size() <= 1) { // No split points means there isn't enough data to split on; 1 split point means we // have // between half the chunk size to full chunk size so there is no need to split yet chunk->clearBytesWritten(); return; } if (minIsInf || maxIsInf) { // We don't want to reset _dataWritten since we want to check the other side right away } else { // We're splitting, so should wait a bit chunk->clearBytesWritten(); } // We assume that if the chunk being split is the first (or last) one on the collection, // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use the // very first (or last) key as a split point. // // This heuristic is skipped for "special" shard key patterns that are not likely to produce // monotonically increasing or decreasing values (e.g. hashed shard keys). 
if (KeyPattern::isOrderedKeyPattern(manager->getShardKeyPattern().toBSON())) { if (minIsInf) { BSONObj key = findExtremeKeyForShard( opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), true); if (!key.isEmpty()) { splitPoints.front() = key.getOwned(); } } else if (maxIsInf) { BSONObj key = findExtremeKeyForShard( opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), false); if (!key.isEmpty()) { splitPoints.back() = key.getOwned(); } } } const auto suggestedMigrateChunk = uassertStatusOK(shardutil::splitChunkAtMultiplePoints(opCtx, chunk->getShardId(), nss, manager->getShardKeyPattern(), manager->getVersion(), chunkRange, splitPoints)); // Balance the resulting chunks if the option is enabled and if the shard suggested a chunk // to balance const bool shouldBalance = [&]() { if (!balancerConfig->shouldBalanceForAutoSplit()) return false; auto collStatus = Grid::get(opCtx)->catalogClient()->getCollection(opCtx, manager->getns()); if (!collStatus.isOK()) { log() << "Auto-split for " << nss << " failed to load collection metadata" << causedBy(redact(collStatus.getStatus())); return false; } return collStatus.getValue().value.getAllowBalance(); }(); log() << "autosplitted " << nss << " chunk: " << redact(chunk->toString()) << " into " << (splitPoints.size() + 1) << " parts (desiredChunkSize " << desiredChunkSize << ")" << (suggestedMigrateChunk ? "" : (std::string) " (migrate suggested" + (shouldBalance ? ")" : ", but no migrations allowed)")); // Reload the chunk manager after the split auto routingInfo = uassertStatusOK( Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx, nss)); if (!shouldBalance || !suggestedMigrateChunk) { return; } // Top chunk optimization - try to move the top chunk out of this shard to prevent the hot // spot from staying on a single shard. This is based on the assumption that succeeding // inserts will fall on the top chunk. // We need to use the latest chunk manager (after the split) in order to have the most // up-to-date view of the chunk we are about to move auto suggestedChunk = routingInfo.cm()->findIntersectingChunkWithSimpleCollation( suggestedMigrateChunk->getMin()); ChunkType chunkToMove; chunkToMove.setNS(nss.ns()); chunkToMove.setShard(suggestedChunk->getShardId()); chunkToMove.setMin(suggestedChunk->getMin()); chunkToMove.setMax(suggestedChunk->getMax()); chunkToMove.setVersion(suggestedChunk->getLastmod()); uassertStatusOK(configsvr_client::rebalanceChunk(opCtx, chunkToMove)); // Ensure the collection gets reloaded because of the move Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss); } catch (const DBException& ex) { chunk->clearBytesWritten(); if (ErrorCodes::isStaleShardingError(ErrorCodes::Error(ex.getCode()))) { log() << "Unable to auto-split chunk " << redact(chunkRange.toString()) << causedBy(ex) << ", going to invalidate routing table entry for " << nss; Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss); } } }
auto_ptr<DBClientCursor> DBClientReplicaSet::query(const string &ns,
                                                   Query query,
                                                   int nToReturn,
                                                   int nToSkip,
                                                   const BSONObj *fieldsToReturn,
                                                   int queryOptions,
                                                   int batchSize) {
    if ( queryOptions & QueryOption_SlaveOk ) {
        // we're ok sending to a slave
        // we'll try up to three slaves before falling back to the master
        // checkSlave will try a different slave automatically after a failure
        for ( int i = 0; i < 3; i++ ) {
            try {
                return checkSlaveQueryResult(
                    checkSlave()->query( ns, query, nToReturn, nToSkip,
                                         fieldsToReturn, queryOptions, batchSize ) );
            }
            catch ( DBException &e ) {
                LOG(1) << "can't query replica set slave " << i << " : " << _slaveHost
                       << causedBy( e ) << endl;
            }
        }
    }

    return checkMaster()->query( ns, query, nToReturn, nToSkip,
                                 fieldsToReturn, queryOptions, batchSize );
}
void DBClientReplicaSet::say( Message& toSend, bool isRetry ) {

    if( ! isRetry )
        _lazyState = LazyState();

    int lastOp = -1;
    bool slaveOk = false;

    if ( ( lastOp = toSend.operation() ) == dbQuery ) {
        // TODO: might be possible to do this faster by changing api
        DbMessage dm( toSend );
        QueryMessage qm( dm );

        if ( ( slaveOk = ( qm.queryOptions & QueryOption_SlaveOk ) ) ) {

            for ( int i = _lazyState._retries; i < 3; i++ ) {
                try {
                    DBClientConnection* slave = checkSlave();
                    slave->say( toSend );

                    _lazyState._lastOp = lastOp;
                    _lazyState._slaveOk = slaveOk;
                    _lazyState._retries = i;
                    _lazyState._lastClient = slave;
                    return;
                }
                catch ( DBException &e ) {
                    LOG(1) << "can't callLazy replica set slave " << i << " : " << _slaveHost
                           << causedBy( e ) << endl;
                }
            }
        }
    }

    DBClientConnection* master = checkMaster();
    master->say( toSend );

    _lazyState._lastOp = lastOp;
    _lazyState._slaveOk = slaveOk;
    _lazyState._retries = 3;
    _lazyState._lastClient = master;
    return;
}
StatusWith<vector<MigrateInfo>> Balancer::_getCandidateChunks(OperationContext* txn) { vector<CollectionType> collections; Status collsStatus = grid.catalogManager(txn)->getCollections(txn, nullptr, &collections, nullptr); if (!collsStatus.isOK()) { return collsStatus; } if (collections.empty()) { return vector<MigrateInfo>(); } // Get a list of all the shards that are participating in this balance round along with any // maximum allowed quotas and current utilization. We get the latter by issuing // db.serverStatus() (mem.mapped) to all shards. // // TODO: skip unresponsive shards and mark information as stale. auto shardInfoStatus = DistributionStatus::populateShardInfoMap(txn); if (!shardInfoStatus.isOK()) { return shardInfoStatus.getStatus(); } const ShardInfoMap shardInfo(std::move(shardInfoStatus.getValue())); if (shardInfo.size() < 2) { return vector<MigrateInfo>(); } OCCASIONALLY warnOnMultiVersion(shardInfo); std::vector<MigrateInfo> candidateChunks; // For each collection, check if the balancing policy recommends moving anything around. for (const auto& coll : collections) { // Skip collections for which balancing is disabled const NamespaceString& nss = coll.getNs(); if (!coll.getAllowBalance()) { LOG(1) << "Not balancing collection " << nss << "; explicitly disabled."; continue; } std::vector<ChunkType> allNsChunks; Status status = grid.catalogManager(txn)->getChunks(txn, BSON(ChunkType::ns(nss.ns())), BSON(ChunkType::min() << 1), boost::none, // all chunks &allNsChunks, nullptr); if (!status.isOK()) { warning() << "failed to load chunks for ns " << nss.ns() << causedBy(status); continue; } set<BSONObj> allChunkMinimums; map<string, vector<ChunkType>> shardToChunksMap; for (const ChunkType& chunk : allNsChunks) { allChunkMinimums.insert(chunk.getMin().getOwned()); vector<ChunkType>& chunksList = shardToChunksMap[chunk.getShard()]; chunksList.push_back(chunk); } if (shardToChunksMap.empty()) { LOG(1) << "skipping empty collection (" << nss.ns() << ")"; continue; } for (ShardInfoMap::const_iterator i = shardInfo.begin(); i != shardInfo.end(); ++i) { // This loop just makes sure there is an entry in shardToChunksMap for every shard shardToChunksMap[i->first]; } DistributionStatus distStatus(shardInfo, shardToChunksMap); // TODO: TagRange contains all the information from TagsType except for the namespace, // so maybe the two can be merged at some point in order to avoid the // transformation below. vector<TagRange> ranges; { vector<TagsType> collectionTags; uassertStatusOK( grid.catalogManager(txn)->getTagsForCollection(txn, nss.ns(), &collectionTags)); for (const auto& tt : collectionTags) { ranges.push_back( TagRange(tt.getMinKey().getOwned(), tt.getMaxKey().getOwned(), tt.getTag())); uassert(16356, str::stream() << "tag ranges not valid for: " << nss.ns(), distStatus.addTagRange(ranges.back())); } } auto statusGetDb = grid.catalogCache()->getDatabase(txn, nss.db().toString()); if (!statusGetDb.isOK()) { warning() << "could not load db config to balance collection [" << nss.ns() << "]: " << statusGetDb.getStatus(); continue; } shared_ptr<DBConfig> cfg = statusGetDb.getValue(); // This line reloads the chunk manager once if this process doesn't know the collection // is sharded yet. shared_ptr<ChunkManager> cm = cfg->getChunkManagerIfExists(txn, nss.ns(), true); if (!cm) { warning() << "could not load chunks to balance " << nss.ns() << " collection"; continue; } // Loop through tags to make sure no chunk spans tags. Split on tag min for all chunks. 
bool didAnySplits = false; for (const TagRange& range : ranges) { BSONObj min = cm->getShardKeyPattern().getKeyPattern().extendRangeBound(range.min, false); if (allChunkMinimums.count(min) > 0) { continue; } didAnySplits = true; log() << "nss: " << nss.ns() << " need to split on " << min << " because there is a range there"; vector<BSONObj> splitPoints; splitPoints.push_back(min); shared_ptr<Chunk> c = cm->findIntersectingChunk(txn, min); Status status = c->multiSplit(txn, splitPoints, NULL); if (!status.isOK()) { error() << "split failed: " << status; } else { LOG(1) << "split worked"; } break; } if (didAnySplits) { // State change, just wait till next round continue; } shared_ptr<MigrateInfo> migrateInfo( BalancerPolicy::balance(nss.ns(), distStatus, _balancedLastTime)); if (migrateInfo) { candidateChunks.emplace_back(*migrateInfo); } } return candidateChunks; }
virtual void process( Message& m , AbstractMessagingPort* p , LastError * le) {
    verify( p );
    Request r( m , p );

    verify( le );
    lastError.startRequest( m , le );

    try {
        r.init();
        r.process();

        // Release connections after non-write op
        if ( ShardConnection::releaseConnectionsAfterResponse && r.expectResponse() ) {
            LOG(2) << "release thread local connections back to pool" << endl;
            ShardConnection::releaseMyConnections();
        }
    }
    catch ( AssertionException & e ) {
        LOG( e.isUserAssertion() ? 1 : 0 ) << "AssertionException while processing op type : "
                                           << m.operation() << " to : " << r.getns()
                                           << causedBy(e) << endl;

        le->raiseError( e.getCode() , e.what() );

        m.header()->id = r.id();

        if ( r.expectResponse() ) {
            BSONObj err = BSON( "$err" << e.what() << "code" << e.getCode() );
            replyToQuery( ResultFlag_ErrSet, p , m , err );
        }
    }
    catch ( DBException& e ) {
        // note that e.toString() is more detailed on a SocketException than
        // e.what(). we should think about what is the right level of detail both
        // for logging and return code.
        log() << "DBException in process: " << e.what() << endl;

        le->raiseError( e.getCode() , e.what() );

        m.header()->id = r.id();

        if ( r.expectResponse() ) {
            BSONObjBuilder b;
            b.append("$err",e.what()).append("code",e.getCode());
            if( !e._shard.empty() ) {
                b.append("shard",e._shard);
            }
            replyToQuery( ResultFlag_ErrSet, p , m , b.obj() );
        }
    }
}
std::string causedBy( const DBException& e ){ return causedBy( e.toString() ); }
std::string causedBy( const std::exception& e ) { return causedBy( e.what() ); }
/** * Performs sanity check on the given connection string on whether the seed list * is consistent with the view of the set using replSetGetStatus. */ bool addReplSetShardCheck( const ConnectionString& servers, string* errMsg ) { bool ok = false; BSONObj replSetStat; try { ScopedDbConnection newShardConn(servers.toString()); ok = newShardConn->runCommand( "admin", BSON( "replSetGetStatus" << 1 ), replSetStat ); newShardConn.done(); } catch ( const DBException& ex ) { *errMsg = str::stream() << "Error encountered while checking status of " << servers.toString() << ": " << causedBy( ex ); } if( !ok ) { if ( replSetStat["info"].str() == "configsvr" ) { *errMsg = "the specified mongod is a --configsvr and " "should thus not be a shard server"; } else { *errMsg = str::stream() << "error encountered calling replSetGetStatus: " << replSetStat; } return false; } // if the shard has only one host, make sure it is not part of a replica set string setName = replSetStat["set"].str(); string commandSetName = servers.getSetName(); if ( commandSetName.empty() && ! setName.empty() ) { *errMsg = str::stream() << "host is part of set: " << setName << " use replica set url format <setname>/<server1>,<server2>,...."; return false; } if ( !commandSetName.empty() && setName.empty() ) { *errMsg = str::stream() << "host did not return a set name, " << "is the replica set still initializing?" << replSetStat; return false; } // if the shard is part of replica set, make sure it is the right one if ( ! commandSetName.empty() && ( commandSetName != setName ) ) { *errMsg = str::stream() << "host is part of a different set: " << setName; return false; } // if the shard is part of a replica set, make sure all the hosts mentioned in // 'servers' are part of the set. It is fine if not all members of the set // are present in 'servers'. bool foundAll = true; string offendingHost; if ( ! commandSetName.empty() ) { set<string> hostSet; BSONElement membersElem( replSetStat["members"] ); if ( membersElem.type() == Array ) { BSONArrayIteratorSorted iter( BSONArray( membersElem.Obj() )); while ( iter.more() ) { hostSet.insert( iter.next()["name"].str() ); // host:port } vector<HostAndPort> hosts = servers.getServers(); for ( size_t i = 0 ; i < hosts.size() ; i++ ) { if (!hosts[i].hasPort()) { hosts[i].setPort(CmdLine::DefaultDBPort); } string host = hosts[i].toString(); // host:port if ( hostSet.find( host ) == hostSet.end() ) { offendingHost = host; foundAll = false; break; } } } if ( hostSet.empty() ) { *errMsg = "replSetGetStatus returned an empty set. " " Please wait for the set to initialize and try again."; return false; } } if ( ! foundAll ) { *errMsg = str::stream() << "in seed list " << servers.toString() << ", host " << offendingHost << " does not belong to replica set " << setName; return false; } return true; }
bool Grid::addShard( string* name , const ConnectionString& servers , long long maxSize , string& errMsg ) { // name can be NULL, so provide a dummy one here to avoid testing it elsewhere string nameInternal; if ( ! name ) { name = &nameInternal; } ReplicaSetMonitorPtr rsMonitor; // Check whether the host (or set) exists and run several sanity checks on this request. // There are two set of sanity checks: making sure adding this particular shard is consistent // with the replica set state (if it exists) and making sure this shards databases can be // brought into the grid without conflict. if ( servers.type() == ConnectionString::SYNC ) { errMsg = "can't use sync cluster as a shard for replica set, " "have to use <setname>/<server1>,<server2>,..."; return false; } vector<string> dbNames; try { bool ok = false; { ScopedDbConnection newShardConn(servers.toString()); BSONObj resIsMongos; ok = newShardConn->runCommand( "admin", BSON( "isdbgrid" << 1 ), resIsMongos ); newShardConn.done(); } // should return ok=0, cmd not found if it's a normal mongod if ( ok ) { errMsg = "can't add a mongos process as a shard"; return false; } if ( servers.type() == ConnectionString::SET ) { if (!addReplSetShardCheck( servers, &errMsg )) { return false; } // shard name defaults to the name of the replica set if ( name->empty() && !servers.getSetName().empty() ) { *name = servers.getSetName(); } } // In order to be accepted as a new shard, that mongod must not have any database name // that exists already in any other shards. If that test passes, the new shard's // databases are going to be entered as non-sharded db's whose primary is the // newly added shard. BSONObj resListDB; { ScopedDbConnection newShardConn(servers.toString()); ok = newShardConn->runCommand( "admin", BSON( "listDatabases" << 1 ), resListDB ); newShardConn.done(); } if ( !ok ) { errMsg = str::stream() << "failed listing " << servers.toString() << "'s databases:" << resListDB;; return false; } BSONObjIterator i( resListDB["databases"].Obj() ); while ( i.more() ) { BSONObj dbEntry = i.next().Obj(); const string& dbName = dbEntry["name"].String(); if ( _isSpecialLocalDB( dbName ) ) { // 'local', 'admin', and 'config' are system DBs and should be excluded here continue; } else { dbNames.push_back( dbName ); } } if ( servers.type() == ConnectionString::SET ) { rsMonitor = ReplicaSetMonitor::get( servers.getSetName() ); } } catch ( DBException& e ) { if ( servers.type() == ConnectionString::SET ) { ReplicaSetMonitor::remove( servers.getSetName() ); } errMsg = str::stream() << "couldn't connect to new shard " << causedBy(e); return false; } // check that none of the existing shard candidate's db's exist elsewhere for ( vector<string>::const_iterator it = dbNames.begin(); it != dbNames.end(); ++it ) { DBConfigPtr config = getDBConfig( *it , false ); if ( config.get() != NULL ) { ostringstream ss; ss << "can't add shard " << servers.toString() << " because a local database '" << *it; ss << "' exists in another " << config->getPrimary().toString(); errMsg = ss.str(); return false; } } // if a name for a shard wasn't provided, pick one. if ( name->empty() && ! _getNewShardName( name ) ) { errMsg = "error generating new shard name"; return false; } // build the ConfigDB shard document BSONObjBuilder b; b.append(ShardType::name(), *name); b.append(ShardType::host(), rsMonitor ? 
rsMonitor->getServerAddress() : servers.toString()); if (maxSize > 0) { b.append(ShardType::maxSize(), maxSize); } BSONObj shardDoc = b.obj(); { ScopedDbConnection conn(configServer.getPrimary().getConnString(), 30); // check whether the set of hosts (or single host) is not an already a known shard BSONObj old = conn->findOne(ShardType::ConfigNS, BSON(ShardType::host(servers.toString()))); if ( ! old.isEmpty() ) { errMsg = "host already used"; conn.done(); return false; } log() << "going to add shard: " << shardDoc << endl; conn->insert(ShardType::ConfigNS , shardDoc); errMsg = conn->getLastError(); if ( ! errMsg.empty() ) { log() << "error adding shard: " << shardDoc << " err: " << errMsg << endl; conn.done(); return false; } conn.done(); } Shard::reloadShardInfo(); // add all databases of the new shard for ( vector<string>::const_iterator it = dbNames.begin(); it != dbNames.end(); ++it ) { DBConfigPtr config = getDBConfig( *it , true , *name ); if ( ! config ) { log() << "adding shard " << servers << " even though could not add database " << *it << endl; } } // Record in changelog BSONObjBuilder shardDetails; shardDetails.append("name", *name); shardDetails.append("host", servers.toString()); configServer.logChange("addShard", "", shardDetails.obj()); return true; }
/**
 * Outline of the delete process:
 * 1. Initialize the client for this thread if there is no client. This is for the worker
 *    threads that are attached to any of the threads servicing client requests.
 * 2. Grant this thread authorization to perform deletes.
 * 3. Temporarily enable mode to bypass shard version checks. TODO: Replace this hack.
 * 4. Setup callback to save deletes to moveChunk directory (only if moveParanoia is true).
 * 5. Delete range.
 * 6. Wait until the majority of the secondaries catch up.
 */
bool RangeDeleterDBEnv::deleteRange(OperationContext* txn,
                                    const RangeDeleteEntry& taskDetails,
                                    long long int* deletedDocs,
                                    std::string* errMsg) {
    const string ns(taskDetails.options.range.ns);
    const BSONObj inclusiveLower(taskDetails.options.range.minKey);
    const BSONObj exclusiveUpper(taskDetails.options.range.maxKey);
    const BSONObj keyPattern(taskDetails.options.range.keyPattern);
    const WriteConcernOptions writeConcern(taskDetails.options.writeConcern);
    const bool fromMigrate = taskDetails.options.fromMigrate;
    const bool onlyRemoveOrphans = taskDetails.options.onlyRemoveOrphanedDocs;

    const bool initiallyHaveClient = haveClient();

    if (!initiallyHaveClient) {
        Client::initThread("RangeDeleter");
    }

    *deletedDocs = 0;
    ShardForceVersionOkModeBlock forceVersion;
    {
        Helpers::RemoveSaver removeSaver("moveChunk", ns, taskDetails.options.removeSaverReason);
        Helpers::RemoveSaver* removeSaverPtr = NULL;
        if (serverGlobalParams.moveParanoia && !taskDetails.options.removeSaverReason.empty()) {
            removeSaverPtr = &removeSaver;
        }

        // log the opId so the user can use it to cancel the delete using killOp.
        unsigned int opId = txn->getCurOp()->opNum();
        log() << "Deleter starting delete for: " << ns << " from " << inclusiveLower << " -> "
              << exclusiveUpper << ", with opId: " << opId << endl;

        try {
            *deletedDocs =
                Helpers::removeRange(txn,
                                     KeyRange(ns, inclusiveLower, exclusiveUpper, keyPattern),
                                     false, /*maxInclusive*/
                                     writeConcern,
                                     removeSaverPtr,
                                     fromMigrate,
                                     onlyRemoveOrphans);

            if (*deletedDocs < 0) {
                *errMsg = "collection or index dropped before data could be cleaned";
                warning() << *errMsg << endl;

                if (!initiallyHaveClient) {
                    txn->getClient()->shutdown();
                }

                return false;
            }

            log() << "rangeDeleter deleted " << *deletedDocs << " documents for " << ns
                  << " from " << inclusiveLower << " -> " << exclusiveUpper << endl;
        }
        catch (const DBException& ex) {
            *errMsg = str::stream() << "Error encountered while deleting range: "
                                    << "ns: " << ns << " from " << inclusiveLower << " -> "
                                    << exclusiveUpper << causedBy(ex);

            if (!initiallyHaveClient) {
                txn->getClient()->shutdown();
            }

            return false;
        }
    }

    if (!initiallyHaveClient) {
        txn->getClient()->shutdown();
    }

    return true;
}
StatusWith<ShardType> ShardingCatalogManagerImpl::_validateHostAsShard( OperationContext* txn, std::shared_ptr<RemoteCommandTargeter> targeter, const std::string* shardProposedName, const ConnectionString& connectionString) { // Check whether any host in the connection is already part of the cluster. Grid::get(txn)->shardRegistry()->reload(txn); for (const auto& hostAndPort : connectionString.getServers()) { std::shared_ptr<Shard> shard; shard = Grid::get(txn)->shardRegistry()->getShardNoReload(hostAndPort.toString()); if (shard) { return {ErrorCodes::OperationFailed, str::stream() << "'" << hostAndPort.toString() << "' " << "is already a member of the existing shard '" << shard->getConnString().toString() << "' (" << shard->getId() << ")."}; } } // Check for mongos and older version mongod connections, and whether the hosts // can be found for the user specified replset. auto swCommandResponse = _runCommandForAddShard(txn, targeter.get(), "admin", BSON("isMaster" << 1)); if (!swCommandResponse.isOK()) { if (swCommandResponse.getStatus() == ErrorCodes::RPCProtocolNegotiationFailed) { // Mongos to mongos commands are no longer supported in the wire protocol // (because mongos does not support OP_COMMAND), similarly for a new mongos // and an old mongod. So the call will fail in such cases. // TODO: If/When mongos ever supports opCommands, this logic will break because // cmdStatus will be OK. return {ErrorCodes::RPCProtocolNegotiationFailed, str::stream() << targeter->connectionString().toString() << " does not recognize the RPC protocol being used. This is" << " likely because it contains a node that is a mongos or an old" << " version of mongod."}; } else { return swCommandResponse.getStatus(); } } // Check for a command response error auto resIsMasterStatus = std::move(swCommandResponse.getValue().commandStatus); if (!resIsMasterStatus.isOK()) { return {resIsMasterStatus.code(), str::stream() << "Error running isMaster against " << targeter->connectionString().toString() << ": " << causedBy(resIsMasterStatus)}; } auto resIsMaster = std::move(swCommandResponse.getValue().response); // Check whether there is a master. If there isn't, the replica set may not have been // initiated. If the connection is a standalone, it will return true for isMaster. bool isMaster; Status status = bsonExtractBooleanField(resIsMaster, "ismaster", &isMaster); if (!status.isOK()) { return Status(status.code(), str::stream() << "isMaster returned invalid 'ismaster' " << "field when attempting to add " << connectionString.toString() << " as a shard: " << status.reason()); } if (!isMaster) { return {ErrorCodes::NotMaster, str::stream() << connectionString.toString() << " does not have a master. If this is a replica set, ensure that it has a" << " healthy primary and that the set has been properly initiated."}; } const string providedSetName = connectionString.getSetName(); const string foundSetName = resIsMaster["setName"].str(); // Make sure the specified replica set name (if any) matches the actual shard's replica set if (providedSetName.empty() && !foundSetName.empty()) { return {ErrorCodes::OperationFailed, str::stream() << "host is part of set " << foundSetName << "; " << "use replica set url format " << "<setname>/<server1>,<server2>, ..."}; } if (!providedSetName.empty() && foundSetName.empty()) { return {ErrorCodes::OperationFailed, str::stream() << "host did not return a set name; " << "is the replica set still initializing? 
" << resIsMaster}; } // Make sure the set name specified in the connection string matches the one where its hosts // belong into if (!providedSetName.empty() && (providedSetName != foundSetName)) { return {ErrorCodes::OperationFailed, str::stream() << "the provided connection string (" << connectionString.toString() << ") does not match the actual set name " << foundSetName}; } // Is it a config server? if (resIsMaster.hasField("configsvr")) { return {ErrorCodes::OperationFailed, str::stream() << "Cannot add " << connectionString.toString() << " as a shard since it is a config server"}; } // If the shard is part of a replica set, make sure all the hosts mentioned in the connection // string are part of the set. It is fine if not all members of the set are mentioned in the // connection string, though. if (!providedSetName.empty()) { std::set<string> hostSet; BSONObjIterator iter(resIsMaster["hosts"].Obj()); while (iter.more()) { hostSet.insert(iter.next().String()); // host:port } if (resIsMaster["passives"].isABSONObj()) { BSONObjIterator piter(resIsMaster["passives"].Obj()); while (piter.more()) { hostSet.insert(piter.next().String()); // host:port } } if (resIsMaster["arbiters"].isABSONObj()) { BSONObjIterator piter(resIsMaster["arbiters"].Obj()); while (piter.more()) { hostSet.insert(piter.next().String()); // host:port } } vector<HostAndPort> hosts = connectionString.getServers(); for (size_t i = 0; i < hosts.size(); i++) { const string host = hosts[i].toString(); // host:port if (hostSet.find(host) == hostSet.end()) { return {ErrorCodes::OperationFailed, str::stream() << "in seed list " << connectionString.toString() << ", host " << host << " does not belong to replica set " << foundSetName << "; found " << resIsMaster.toString()}; } } } string actualShardName; if (shardProposedName) { actualShardName = *shardProposedName; } else if (!foundSetName.empty()) { // Default it to the name of the replica set actualShardName = foundSetName; } // Disallow adding shard replica set with name 'config' if (actualShardName == "config") { return {ErrorCodes::BadValue, "use of shard replica set with name 'config' is not allowed"}; } // Retrieve the most up to date connection string that we know from the replica set monitor (if // this is a replica set shard, otherwise it will be the same value as connectionString). ConnectionString actualShardConnStr = targeter->connectionString(); ShardType shard; shard.setName(actualShardName); shard.setHost(actualShardConnStr.toString()); return shard; }
void runThread(ConnectionString& hostConn, unsigned threadId, unsigned seed, BSONObj& cmdObj, BSONObjBuilder& result) { stringstream ss; ss << "thread-" << threadId; setThreadName(ss.str().c_str()); // Lock name string lockName = string_field(cmdObj, "lockName", this->name + "_lock"); // Range of clock skew in diff threads int skewRange = (int) number_field(cmdObj, "skewRange", 1); // How long to wait with the lock int threadWait = (int) number_field(cmdObj, "threadWait", 30); if(threadWait <= 0) threadWait = 1; // Max amount of time (ms) a thread waits before checking the lock again int threadSleep = (int) number_field(cmdObj, "threadSleep", 30); if(threadSleep <= 0) threadSleep = 1; // How long until the lock is forced in ms, only compared locally unsigned long long takeoverMS = (unsigned long long) number_field(cmdObj, "takeoverMS", 0); // Whether or not we should hang some threads int hangThreads = (int) number_field(cmdObj, "hangThreads", 0); boost::mt19937 gen((boost::mt19937::result_type) seed); boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomSkew(gen, boost::uniform_int<>(0, skewRange)); boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomWait(gen, boost::uniform_int<>(1, threadWait)); boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomSleep(gen, boost::uniform_int<>(1, threadSleep)); int skew = 0; if (!lock.get()) { // Pick a skew, but the first two threads skew the whole range if(threadId == 0) skew = -skewRange / 2; else if(threadId == 1) skew = skewRange / 2; else skew = randomSkew() - (skewRange / 2); // Skew this thread jsTimeVirtualThreadSkew( skew ); log() << "Initializing lock with skew of " << skew << " for thread " << threadId << endl; lock.reset(new DistributedLock(hostConn, lockName, takeoverMS, true )); log() << "Skewed time " << jsTime() << " for thread " << threadId << endl << " max wait (with lock: " << threadWait << ", after lock: " << threadSleep << ")" << endl << " takeover in " << takeoverMS << "(ms remote)" << endl; } DistributedLock* myLock = lock.get(); bool errors = false; BSONObj lockObj; while (keepGoing) { try { if (myLock->lock_try("Testing distributed lock with skew.", false, &lockObj )) { log() << "**** Locked for thread " << threadId << " with ts " << lockObj["ts"] << endl; if( count % 2 == 1 && ! myLock->lock_try( "Testing lock re-entry.", true ) ) { errors = true; log() << "**** !Could not re-enter lock already held" << endl; break; } if( count % 3 == 1 && myLock->lock_try( "Testing lock non-re-entry.", false ) ) { errors = true; log() << "**** !Invalid lock re-entry" << endl; break; } count++; int before = count; int sleep = randomWait(); sleepmillis(sleep); int after = count; if(after != before) { errors = true; log() << "**** !Bad increment while sleeping with lock for: " << sleep << "ms" << endl; break; } // Unlock only half the time... if(hangThreads == 0 || threadId % hangThreads != 0) { log() << "**** Unlocking for thread " << threadId << " with ts " << lockObj["ts"] << endl; myLock->unlock( &lockObj ); } else { log() << "**** Not unlocking for thread " << threadId << endl; DistributedLock::killPinger( *myLock ); // We're simulating a crashed process... break; } } } catch( LockException& e ) { log() << "*** !Could not try distributed lock." << causedBy( e ) << endl; break; } sleepmillis(randomSleep()); } result << "errors" << errors << "skew" << skew << "takeover" << (long long) takeoverMS << "localTimeout" << (takeoverMS > 0); }
DBClientBase* DBClientReplicaSet::callLazy( Message& toSend ) {
    if ( toSend.operation() == dbQuery ) {
        // TODO: might be possible to do this faster by changing api
        DbMessage dm( toSend );
        QueryMessage qm( dm );
        if ( qm.queryOptions & QueryOption_SlaveOk ) {
            for ( int i = 0; i < 3; i++ ) {
                try {
                    return checkSlave()->callLazy( toSend );
                }
                catch ( DBException &e ) {
                    LOG(1) << "can't callLazy replica set slave " << i << " : " << _slaveHost
                           << causedBy( e ) << endl;
                }
            }
        }
    }

    return checkMaster()->callLazy( toSend );
}
std::string causedBy( const std::string& e ){ return causedBy( e.c_str() ); }
void Balancer::run() { Client::initThread("Balancer"); // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer // thread prematurely. while (!inShutdown()) { auto txn = cc().makeOperationContext(); if (!_init(txn.get())) { log() << "will retry to initialize balancer in one minute"; sleepsecs(60); continue; } break; } Seconds balanceRoundInterval(kBalanceRoundDefaultInterval); while (!inShutdown()) { auto txn = cc().makeOperationContext(); BalanceRoundDetails roundDetails; try { // ping has to be first so we keep things in the config server in sync _ping(txn.get()); MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) { const BSONObj& data = scopedBalancerRoundInterval.getData(); balanceRoundInterval = Seconds(data["sleepSecs"].numberInt()); } BSONObj balancerResult; // use fresh shard state grid.shardRegistry()->reload(txn.get()); // refresh chunk size (even though another balancer might be active) Chunk::refreshChunkSize(txn.get()); auto balSettingsResult = grid.catalogManager(txn.get())->getGlobalSettings( txn.get(), SettingsType::BalancerDocKey); const bool isBalSettingsAbsent = balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument; if (!balSettingsResult.isOK() && !isBalSettingsAbsent) { warning() << balSettingsResult.getStatus(); return; } const SettingsType& balancerConfig = isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue(); // now make sure we should even be running if ((!isBalSettingsAbsent && !Chunk::shouldBalance(balancerConfig)) || MONGO_FAIL_POINT(skipBalanceRound)) { LOG(1) << "skipping balancing round because balancing is disabled"; // Ping again so scripts can determine if we're active without waiting _ping(txn.get(), true); sleepFor(balanceRoundInterval); continue; } uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get())); { auto scopedDistLock = grid.catalogManager(txn.get()) ->distLock(txn.get(), "balancer", "doing balance round", DistLockManager::kSingleLockAttemptTimeout); if (!scopedDistLock.isOK()) { LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus()); // Ping again so scripts can determine if we're active without waiting _ping(txn.get(), true); sleepFor(balanceRoundInterval); // no need to wake up soon continue; } const bool waitForDelete = (balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete() : false); MigrationSecondaryThrottleOptions secondaryThrottle( MigrationSecondaryThrottleOptions::create( MigrationSecondaryThrottleOptions::kDefault)); if (balancerConfig.isKeySet()) { secondaryThrottle = uassertStatusOK(MigrationSecondaryThrottleOptions::createFromBalancerConfig( balancerConfig.toBSON())); } LOG(1) << "*** start balancing round. " << "waitForDelete: " << waitForDelete << ", secondaryThrottle: " << secondaryThrottle.toBSON(); const auto candidateChunks = uassertStatusOK(_getCandidateChunks(txn.get())); if (candidateChunks.empty()) { LOG(1) << "no need to move any chunk"; _balancedLastTime = 0; } else { _balancedLastTime = _moveChunks(txn.get(), candidateChunks, secondaryThrottle, waitForDelete); roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()), _balancedLastTime); grid.catalogManager(txn.get()) ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON()); } LOG(1) << "*** End of balancing round"; } // Ping again so scripts can determine if we're active without waiting _ping(txn.get(), true); sleepFor(_balancedLastTime ? 
kShortBalanceRoundInterval : balanceRoundInterval); } catch (const std::exception& e) { log() << "caught exception while doing balance: " << e.what(); // Just to match the opening statement if in log level 1 LOG(1) << "*** End of balancing round"; // This round failed, tell the world! roundDetails.setFailed(e.what()); grid.catalogManager(txn.get()) ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON()); // Sleep a fair amount before retrying because of the error sleepFor(balanceRoundInterval); } } }
std::string causedBy( const std::string* e ) { return (e && *e != "") ? causedBy(*e) : ""; }
int Balancer::_moveChunks(OperationContext* txn, const vector<MigrateInfo>& candidateChunks, const MigrationSecondaryThrottleOptions& secondaryThrottle, bool waitForDelete) { int movedCount = 0; for (const auto& migrateInfo : candidateChunks) { // If the balancer was disabled since we started this round, don't start new chunks // moves. const auto balSettingsResult = grid.catalogManager(txn)->getGlobalSettings(txn, SettingsType::BalancerDocKey); const bool isBalSettingsAbsent = balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument; if (!balSettingsResult.isOK() && !isBalSettingsAbsent) { warning() << balSettingsResult.getStatus(); return movedCount; } const SettingsType& balancerConfig = isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue(); if ((!isBalSettingsAbsent && !Chunk::shouldBalance(balancerConfig)) || MONGO_FAIL_POINT(skipBalanceRound)) { LOG(1) << "Stopping balancing round early as balancing was disabled"; return movedCount; } // Changes to metadata, borked metadata, and connectivity problems between shards // should cause us to abort this chunk move, but shouldn't cause us to abort the entire // round of chunks. // // TODO(spencer): We probably *should* abort the whole round on issues communicating // with the config servers, but its impossible to distinguish those types of failures // at the moment. // // TODO: Handle all these things more cleanly, since they're expected problems const NamespaceString nss(migrateInfo.ns); try { shared_ptr<DBConfig> cfg = uassertStatusOK(grid.catalogCache()->getDatabase(txn, nss.db().toString())); // NOTE: We purposely do not reload metadata here, since _getCandidateChunks already // tried to do so once shared_ptr<ChunkManager> cm = cfg->getChunkManager(txn, migrateInfo.ns); uassert(28628, str::stream() << "Collection " << migrateInfo.ns << " was deleted while balancing was active. Aborting balancing round.", cm); ChunkPtr c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min); if (c->getMin().woCompare(migrateInfo.chunk.min) || c->getMax().woCompare(migrateInfo.chunk.max)) { // Likely a split happened somewhere, so force reload the chunk manager cm = cfg->getChunkManager(txn, migrateInfo.ns, true); invariant(cm); c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min); if (c->getMin().woCompare(migrateInfo.chunk.min) || c->getMax().woCompare(migrateInfo.chunk.max)) { log() << "chunk mismatch after reload, ignoring will retry issue " << migrateInfo.chunk.toString(); continue; } } BSONObj res; if (c->moveAndCommit(txn, migrateInfo.to, Chunk::MaxChunkSize, secondaryThrottle, waitForDelete, 0, /* maxTimeMS */ res)) { movedCount++; continue; } // The move requires acquiring the collection metadata's lock, which can fail. 
log() << "balancer move failed: " << res << " from: " << migrateInfo.from << " to: " << migrateInfo.to << " chunk: " << migrateInfo.chunk; Status moveStatus = getStatusFromCommandResult(res); if (moveStatus == ErrorCodes::ChunkTooBig || res["chunkTooBig"].trueValue()) { // Reload just to be safe cm = cfg->getChunkManager(txn, migrateInfo.ns); invariant(cm); c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min); log() << "performing a split because migrate failed for size reasons"; Status status = c->split(txn, Chunk::normal, NULL, NULL); log() << "split results: " << status; if (!status.isOK()) { log() << "marking chunk as jumbo: " << c->toString(); c->markAsJumbo(txn); // We increment moveCount so we do another round right away movedCount++; } } } catch (const DBException& ex) { warning() << "could not move chunk " << migrateInfo.chunk.toString() << ", continuing balancing round" << causedBy(ex); } } return movedCount; }
std::string causedBy( const Status& e ){ return causedBy( e.reason() ); }
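/**
 * Illustrative sketch (not part of the original source): the causedBy() overloads above all
 * funnel down to one string form, so callers can append context from a Status, DBException,
 * std::exception, std::string, or string pointer in exactly the same way. The helper name
 * below is hypothetical, and it assumes the base overload prepends the " :: caused by :: "
 * marker visible in the log output of the surrounding functions.
 */
std::string buildContextMessage( const std::string& what, const Status& why ) {
    StringBuilder builder;
    builder << "could not complete " << what;
    if ( !why.isOK() ) {
        // causedBy( const Status& ) forwards to causedBy( why.reason() ) and then to the
        // base string overload, which adds the " :: caused by :: " prefix.
        builder << causedBy( why );
    }
    return builder.str();
}

// Example usage, mirroring the warning()/log() call sites elsewhere in this section
// (someShardOperation is a placeholder):
//     Status status = someShardOperation();
//     if ( !status.isOK() ) {
//         warning() << buildContextMessage( "balancer round", status ) << endl;
//     }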
bool ClientInfo::getLastError( const string& dbName, const BSONObj& options, BSONObjBuilder& result, string& errmsg, bool fromWriteBackListener) { set<string> * shards = getPrev(); if ( shards->size() == 0 ) { result.appendNull( "err" ); return true; } vector<WBInfo> writebacks; // // TODO: These branches should be collapsed into a single codepath // // handle single server if ( shards->size() == 1 ) { string theShard = *(shards->begin() ); BSONObj res; bool ok = false; { LOG(5) << "gathering response for gle from: " << theShard << endl; ShardConnection conn( theShard , "" ); try { ok = conn->runCommand( dbName , options , res ); } catch( std::exception &e ) { string message = str::stream() << "could not get last error from shard " << theShard << causedBy( e ); warning() << message << endl; errmsg = message; // Catch everything that happens here, since we need to ensure we return our connection when we're // finished. conn.done(); return false; } res = res.getOwned(); conn.done(); } _addWriteBack( writebacks, res, true ); LOG(4) << "gathering writebacks from " << sinceLastGetError().size() << " hosts for" << " gle (" << theShard << ")" << endl; // hit other machines just to block for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) { string temp = *i; if ( temp == theShard ) continue; LOG(5) << "gathering writebacks for single-shard gle from: " << temp << endl; try { ShardConnection conn( temp , "" ); ON_BLOCK_EXIT_OBJ( conn, &ShardConnection::done ); _addWriteBack( writebacks, conn->getLastErrorDetailed(), false ); } catch( std::exception &e ){ warning() << "could not clear last error from shard " << temp << causedBy( e ) << endl; } } clearSinceLastGetError(); LOG(4) << "checking " << writebacks.size() << " writebacks for" << " gle (" << theShard << ")" << endl; if ( writebacks.size() ){ vector<BSONObj> v = _handleWriteBacks( writebacks , fromWriteBackListener ); if ( v.size() == 0 && fromWriteBackListener ) { // ok } else { // this will usually be 1 // it can be greater than 1 if a write to a different shard // than the last write op had a writeback // all we're going to report is the first // since that's the current write // but we block for all verify( v.size() >= 1 ); if ( res["writebackSince"].numberInt() > 0 ) { // got writeback from older op // ignore the result from it, just needed to wait result.appendElements( res ); } else if ( writebacks[0].fromLastOperation ) { result.appendElements( v[0] ); result.appendElementsUnique( res ); result.append( "writebackGLE" , v[0] ); result.append( "initialGLEHost" , theShard ); result.append( "initialGLE", res ); } else { // there was a writeback // but its from an old operations // so all that's important is that we block, not that we return stats result.appendElements( res ); } } } else { result.append( "singleShard" , theShard ); result.appendElements( res ); } return ok; } BSONArrayBuilder bbb( result.subarrayStart( "shards" ) ); BSONObjBuilder shardRawGLE; long long n = 0; int updatedExistingStat = 0; // 0 is none, -1 has but false, 1 has true // hit each shard vector<string> errors; vector<BSONObj> errorObjects; for ( set<string>::iterator i = shards->begin(); i != shards->end(); i++ ) { string theShard = *i; bbb.append( theShard ); LOG(5) << "gathering a response for gle from: " << theShard << endl; boost::scoped_ptr<ShardConnection> conn; BSONObj res; bool ok = false; try { conn.reset( new ShardConnection( theShard , "" ) ); // constructor can throw if shard is down ok = 
(*conn)->runCommand( dbName , options , res ); shardRawGLE.append( theShard , res ); } catch( std::exception &e ){ // Safe to return here, since we haven't started any extra processing yet, just collecting // responses. string message = str::stream() << "could not get last error from a shard " << theShard << causedBy( e ); warning() << message << endl; errmsg = message; if (conn) conn->done(); return false; } _addWriteBack( writebacks, res, true ); string temp = DBClientWithCommands::getLastErrorString( res ); if ( (*conn)->type() != ConnectionString::SYNC && ( ok == false || temp.size() ) ) { errors.push_back( temp ); errorObjects.push_back( res ); } n += res["n"].numberLong(); if ( res["updatedExisting"].type() ) { if ( res["updatedExisting"].trueValue() ) updatedExistingStat = 1; else if ( updatedExistingStat == 0 ) updatedExistingStat = -1; } conn->done(); } bbb.done(); result.append( "shardRawGLE" , shardRawGLE.obj() ); result.appendNumber( "n" , n ); if ( updatedExistingStat ) result.appendBool( "updatedExisting" , updatedExistingStat > 0 ); LOG(4) << "gathering writebacks from " << sinceLastGetError().size() << " hosts for" << " gle (" << shards->size() << " shards)" << endl; // hit other machines just to block for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) { string temp = *i; if ( shards->count( temp ) ) continue; LOG(5) << "gathering writebacks for multi-shard gle from: " << temp << endl; ShardConnection conn( temp , "" ); try { _addWriteBack( writebacks, conn->getLastErrorDetailed(), false ); } catch( std::exception &e ){ warning() << "could not clear last error from a shard " << temp << causedBy( e ) << endl; } conn.done(); } clearSinceLastGetError(); LOG(4) << "checking " << writebacks.size() << " writebacks for" << " gle (" << shards->size() << " shards)" << endl; if ( errors.size() == 0 ) { result.appendNull( "err" ); _handleWriteBacks( writebacks , fromWriteBackListener ); return true; } result.append( "err" , errors[0].c_str() ); { // errs BSONArrayBuilder all( result.subarrayStart( "errs" ) ); for ( unsigned i=0; i<errors.size(); i++ ) { all.append( errors[i].c_str() ); } all.done(); } { // errObjects BSONArrayBuilder all( result.subarrayStart( "errObjects" ) ); for ( unsigned i=0; i<errorObjects.size(); i++ ) { all.append( errorObjects[i] ); } all.done(); } _handleWriteBacks( writebacks , fromWriteBackListener ); return true; }
void Socket::handleSendError(int ret, const char* context) { #if defined(_WIN32) const int mongo_errno = WSAGetLastError(); if (mongo_errno == WSAETIMEDOUT && _timeout != 0) { #else const int mongo_errno = errno; if ((mongo_errno == EAGAIN || mongo_errno == EWOULDBLOCK) && _timeout != 0) { #endif LOG(_logLevel) << "Socket " << context << " send() timed out " << remoteString(); throw SocketException(SocketException::SEND_TIMEOUT, remoteString()); } else if (mongo_errno != EINTR) { LOG(_logLevel) << "Socket " << context << " send() " << errnoWithDescription(mongo_errno) << ' ' << remoteString(); throw SocketException(SocketException::SEND_ERROR, remoteString()); } } void Socket::handleRecvError(int ret, int len) { if (ret == 0) { LOG(3) << "Socket recv() conn closed? " << remoteString(); throw SocketException(SocketException::CLOSED, remoteString()); } // ret < 0 #if defined(_WIN32) int e = WSAGetLastError(); #else int e = errno; #if defined(EINTR) if (e == EINTR) { return; } #endif #endif #if defined(_WIN32) // Windows if ((e == EAGAIN || e == WSAETIMEDOUT) && _timeout > 0) { #else if (e == EAGAIN && _timeout > 0) { #endif // this is a timeout LOG(_logLevel) << "Socket recv() timeout " << remoteString(); throw SocketException(SocketException::RECV_TIMEOUT, remoteString()); } LOG(_logLevel) << "Socket recv() " << errnoWithDescription(e) << " " << remoteString(); throw SocketException(SocketException::RECV_ERROR, remoteString()); } void Socket::setTimeout(double secs) { setSockTimeouts(_fd, secs); } // TODO: allow modification? // // <positive value> : secs to wait between stillConnected checks // 0 : always check // -1 : never check const int Socket::errorPollIntervalSecs(5); // Patch to allow better tolerance of flaky network connections that get broken // while we aren't looking. // TODO: Remove when better async changes come. // // isStillConnected() polls the socket at max every Socket::errorPollIntervalSecs to determine // if any disconnection-type events have happened on the socket. bool Socket::isStillConnected() { if (_fd == INVALID_SOCKET) { // According to the man page, poll will respond with POLLVNAL for invalid or // unopened descriptors, but it doesn't seem to be properly implemented in // some platforms - it can return 0 events and 0 for revent. Hence this workaround. return false; } if (errorPollIntervalSecs < 0) return true; if (!isPollSupported()) return true; // nothing we can do time_t now = time(0); time_t idleTimeSecs = now - _lastValidityCheckAtSecs; // Only check once every 5 secs if (idleTimeSecs < errorPollIntervalSecs) return true; // Reset our timer, we're checking the connection _lastValidityCheckAtSecs = now; // It's been long enough, poll to see if our socket is still connected pollfd pollInfo; pollInfo.fd = _fd; // We only care about reading the EOF message on clean close (and errors) pollInfo.events = POLLIN; // Poll( info[], size, timeout ) - timeout == 0 => nonblocking int nEvents = socketPoll(&pollInfo, 1, 0); LOG(2) << "polling for status of connection to " << remoteString() << ", " << (nEvents == 0 ? "no events" : nEvents == -1 ? 
"error detected" : "event detected"); if (nEvents == 0) { // No events incoming, return still connected AFAWK return true; } else if (nEvents < 0) { // Poll itself failed, this is weird, warn and log errno warning() << "Socket poll() failed during connectivity check" << " (idle " << idleTimeSecs << " secs," << " remote host " << remoteString() << ")" << causedBy(errnoWithDescription()); // Return true since it's not clear that we're disconnected. return true; } dassert(nEvents == 1); dassert(pollInfo.revents > 0); // Return false at this point, some event happened on the socket, but log what the // actual event was. if (pollInfo.revents & POLLIN) { // There shouldn't really be any data to recv here, so make sure this // is a clean hangup. const int testBufLength = 1024; char testBuf[testBufLength]; int recvd = ::recv(_fd, testBuf, testBufLength, portRecvFlags); if (recvd < 0) { // An error occurred during recv, warn and log errno warning() << "Socket recv() failed during connectivity check" << " (idle " << idleTimeSecs << " secs," << " remote host " << remoteString() << ")" << causedBy(errnoWithDescription()); } else if (recvd > 0) { // We got nonzero data from this socket, very weird? // Log and warn at runtime, log and abort at devtime // TODO: Dump the data to the log somehow? error() << "Socket found pending " << recvd << " bytes of data during connectivity check" << " (idle " << idleTimeSecs << " secs," << " remote host " << remoteString() << ")"; DEV { std::string hex = hexdump(testBuf, recvd); error() << "Hex dump of stale log data: " << hex; } dassert(false); } else { // recvd == 0, socket closed remotely, just return false LOG(0) << "Socket closed remotely, no longer connected" << " (idle " << idleTimeSecs << " secs," << " remote host " << remoteString() << ")"; } } else if (pollInfo.revents & POLLHUP) {
BSONObj DBClientReplicaSet::findOne(const string &ns,
                                    const Query& query,
                                    const BSONObj *fieldsToReturn,
                                    int queryOptions) {
    if ( queryOptions & QueryOption_SlaveOk ) {
        // we're ok sending to a slave
        // we'll try up to three slaves before falling back to the master
        // checkSlave will try a different slave automatically after a failure
        for ( int i = 0; i < 3; i++ ) {
            try {
                return checkSlave()->findOne(ns,query,fieldsToReturn,queryOptions);
            }
            catch ( DBException &e ) {
                LOG(1) << "can't findone replica set slave " << i << " : " << _slaveHost
                       << causedBy( e ) << endl;
            }
        }
    }

    return checkMaster()->findOne(ns,query,fieldsToReturn,queryOptions);
}
virtual void process( Message& m , AbstractMessagingPort* p , LastError * le) {
    verify( p );
    Request r( m , p );

    verify( le );
    lastError.startRequest( m , le );

    try {
        r.init();
        r.process();
    }
    catch ( AssertionException & e ) {
        log( e.isUserAssertion() ? 1 : 0 ) << "AssertionException while processing op type : "
                                           << m.operation() << " to : " << r.getns()
                                           << causedBy(e) << endl;

        le->raiseError( e.getCode() , e.what() );

        m.header()->id = r.id();

        if ( r.expectResponse() ) {
            BSONObj err = BSON( "$err" << e.what() << "code" << e.getCode() );
            replyToQuery( ResultFlag_ErrSet, p , m , err );
        }
    }
    catch ( DBException& e ) {
        log() << "DBException in process: " << e.what() << endl;

        le->raiseError( e.getCode() , e.what() );

        m.header()->id = r.id();

        if ( r.expectResponse() ) {
            BSONObj err = BSON( "$err" << e.what() << "code" << e.getCode() );
            replyToQuery( ResultFlag_ErrSet, p , m , err );
        }
    }
}
bool DBClientReplicaSet::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) { if ( toSend.operation() == dbQuery ) { // TODO: might be possible to do this faster by changing api DbMessage dm( toSend ); QueryMessage qm( dm ); if ( qm.queryOptions & QueryOption_SlaveOk ) { for ( int i=0; i<3; i++ ) { try { DBClientConnection* s = checkSlave(); if ( actualServer ) *actualServer = s->getServerAddress(); return s->call( toSend , response , assertOk ); } catch ( DBException &e ) { LOG(1) << "can't call replica set slave " << i << " : " << _slaveHost << causedBy( e ) << endl; if ( actualServer ) *actualServer = ""; } } } } DBClientConnection* m = checkMaster(); if ( actualServer ) *actualServer = m->getServerAddress(); return m->call( toSend , response , assertOk ); }
void Balancer::run() { Client::initThread("Balancer"); // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer // thread prematurely. while (!inShutdown()) { auto txn = cc().makeOperationContext(); if (!_init(txn.get())) { log() << "will retry to initialize balancer in one minute"; sleepsecs(60); continue; } break; } Seconds balanceRoundInterval(kBalanceRoundDefaultInterval); while (!inShutdown()) { auto txn = cc().makeOperationContext(); BalanceRoundDetails roundDetails; try { // ping has to be first so we keep things in the config server in sync _ping(txn.get(), false); MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) { const BSONObj& data = scopedBalancerRoundInterval.getData(); balanceRoundInterval = Seconds(data["sleepSecs"].numberInt()); } // Use fresh shard state and balancer settings Grid::get(txn.get())->shardRegistry()->reload(txn.get()); auto balancerConfig = Grid::get(txn.get())->getBalancerConfiguration(); Status refreshStatus = balancerConfig->refreshAndCheck(txn.get()); if (!refreshStatus.isOK()) { warning() << "Skipping balancing round" << causedBy(refreshStatus); sleepFor(balanceRoundInterval); continue; } // now make sure we should even be running if (!balancerConfig->isBalancerActive() || MONGO_FAIL_POINT(skipBalanceRound)) { LOG(1) << "skipping balancing round because balancing is disabled"; // Ping again so scripts can determine if we're active without waiting _ping(txn.get(), true); sleepFor(balanceRoundInterval); continue; } uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get())); { auto scopedDistLock = grid.catalogManager(txn.get()) ->distLock(txn.get(), "balancer", "doing balance round", DistLockManager::kSingleLockAttemptTimeout); if (!scopedDistLock.isOK()) { LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus()); // Ping again so scripts can determine if we're active without waiting _ping(txn.get(), true); sleepFor(balanceRoundInterval); // no need to wake up soon continue; } LOG(1) << "*** start balancing round. " << "waitForDelete: " << balancerConfig->waitForDelete() << ", secondaryThrottle: " << balancerConfig->getSecondaryThrottle().toBSON(); OCCASIONALLY warnOnMultiVersion( uassertStatusOK(_clusterStats->getStats(txn.get()))); Status status = _enforceTagRanges(txn.get()); if (!status.isOK()) { warning() << "Failed to enforce tag ranges" << causedBy(status); } else { LOG(1) << "Done enforcing tag range boundaries."; } const auto candidateChunks = uassertStatusOK( _chunkSelectionPolicy->selectChunksToMove(txn.get(), _balancedLastTime)); if (candidateChunks.empty()) { LOG(1) << "no need to move any chunk"; _balancedLastTime = 0; } else { _balancedLastTime = _moveChunks(txn.get(), candidateChunks, balancerConfig->getSecondaryThrottle(), balancerConfig->waitForDelete()); roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()), _balancedLastTime); grid.catalogManager(txn.get()) ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON()); } LOG(1) << "*** End of balancing round"; } // Ping again so scripts can determine if we're active without waiting _ping(txn.get(), true); sleepFor(_balancedLastTime ? kShortBalanceRoundInterval : balanceRoundInterval); } catch (const std::exception& e) { log() << "caught exception while doing balance: " << e.what(); // Just to match the opening statement if in log level 1 LOG(1) << "*** End of balancing round"; // This round failed, tell the world! 
roundDetails.setFailed(e.what()); grid.catalogManager(txn.get()) ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON()); // Sleep a fair amount before retrying because of the error sleepFor(balanceRoundInterval); } } }
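Because the loop above re-reads balanceRoundInterval from the balancerRoundIntervalSetting fail point on every pass, a test can shorten the sleep between rounds without restarting the process. A hedged sketch, assuming the usual FailPoint registry accessors (the exact headers and helpers have moved around between versions):

#include "mongo/db/jsobj.h"
#include "mongo/util/fail_point_service.h"

// Illustrative only: make the balancer sleep one second between rounds.
void shortenBalancerRoundsForTest() {
    mongo::FailPoint* fp =
        mongo::getGlobalFailPointRegistry()->getFailPoint("balancerRoundIntervalSetting");
    if (!fp) {
        return;  // fail point not registered in this build
    }
    // Balancer::run() reads data["sleepSecs"].numberInt() inside its MONGO_FAIL_POINT_BLOCK.
    fp->setMode(mongo::FailPoint::alwaysOn, 0, BSON("sleepSecs" << 1));
}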
// Semantics of this method are basically that if the lock cannot be acquired, returns false, // can be retried. If the lock should not be tried again (some unexpected error), // a LockException is thrown. bool DistributedLock::lock_try(const OID& lockID, const string& why, BSONObj* other, double timeout) { // This should always be true, if not, we are using the lock incorrectly. verify(_name != ""); auto lockTimeout = _lockTimeout; MONGO_FAIL_POINT_BLOCK(setSCCCDistLockTimeout, customTimeout) { const BSONObj& data = customTimeout.getData(); lockTimeout = data["timeoutMs"].numberInt(); } LOG(logLvl) << "trying to acquire new distributed lock for " << _name << " on " << _conn << " ( lock timeout : " << lockTimeout << ", ping interval : " << _lockPing << ", process : " << _processId << " )" << endl; // write to dummy if 'other' is null BSONObj dummyOther; if (other == NULL) other = &dummyOther; ScopedDbConnection conn(_conn.toString(), timeout); BSONObjBuilder queryBuilder; queryBuilder.append(LocksType::name(), _name); queryBuilder.append(LocksType::state(), LocksType::UNLOCKED); { // make sure its there so we can use simple update logic below BSONObj o = conn->findOne(LocksType::ConfigNS, BSON(LocksType::name(_name))).getOwned(); // Case 1: No locks if (o.isEmpty()) { try { LOG(logLvl) << "inserting initial doc in " << LocksType::ConfigNS << " for lock " << _name << endl; conn->insert(LocksType::ConfigNS, BSON(LocksType::name(_name) << LocksType::state(LocksType::UNLOCKED) << LocksType::who("") << LocksType::lockID(OID()))); } catch (UserException& e) { warning() << "could not insert initial doc for distributed lock " << _name << causedBy(e) << endl; } } // Case 2: A set lock that we might be able to force else if (o[LocksType::state()].numberInt() > LocksType::UNLOCKED) { string lockName = o[LocksType::name()].String() + string("/") + o[LocksType::process()].String(); BSONObj lastPing = conn->findOne( LockpingsType::ConfigNS, o[LocksType::process()].wrap(LockpingsType::process())); if (lastPing.isEmpty()) { LOG(logLvl) << "empty ping found for process in lock '" << lockName << "'" << endl; // TODO: Using 0 as a "no time found" value Will fail if dates roll over, but then, // so will a lot. lastPing = BSON(LockpingsType::process(o[LocksType::process()].String()) << LockpingsType::ping(Date_t())); } unsigned long long elapsed = 0; unsigned long long takeover = lockTimeout; DistLockPingInfo lastPingEntry = getLastPing(); LOG(logLvl) << "checking last ping for lock '" << lockName << "' against process " << lastPingEntry.processId << " and ping " << lastPingEntry.lastPing; try { Date_t remote = remoteTime(_conn); auto pingDocProcessId = lastPing[LockpingsType::process()].String(); auto pingDocPingValue = lastPing[LockpingsType::ping()].Date(); // Timeout the elapsed time using comparisons of remote clock // For non-finalized locks, timeout 15 minutes since last seen (ts) // For finalized locks, timeout 15 minutes since last ping bool recPingChange = o[LocksType::state()].numberInt() == LocksType::LOCKED && (lastPingEntry.processId != pingDocProcessId || lastPingEntry.lastPing != pingDocPingValue); bool recTSChange = lastPingEntry.lockSessionId != o[LocksType::lockID()].OID(); if (recPingChange || recTSChange) { // If the ping has changed since we last checked, mark the current date and time setLastPing(DistLockPingInfo(pingDocProcessId, pingDocPingValue, remote, o[LocksType::lockID()].OID(), OID())); } else { // GOTCHA! 
// Due to network issues, it is possible that the current time // is less than the remote time. We *have* to check this here, otherwise // we overflow and our lock breaks. if (lastPingEntry.configLocalTime >= remote) elapsed = 0; else elapsed = durationCount<Milliseconds>(remote - lastPingEntry.configLocalTime); } } catch (LockException& e) { // Remote server cannot be found / is not responsive warning() << "Could not get remote time from " << _conn << causedBy(e); // If our config server is having issues, forget all the pings until we can see it // again resetLastPing(); } if (elapsed <= takeover) { LOG(1) << "could not force lock '" << lockName << "' because elapsed time " << elapsed << " <= takeover time " << takeover; *other = o; other->getOwned(); conn.done(); return false; } LOG(0) << "forcing lock '" << lockName << "' because elapsed time " << elapsed << " > takeover time " << takeover; if (elapsed > takeover) { // Lock may be forced; reset our timer whether the force succeeds or fails // Ensures that another timeout must happen if something borks up here, and resets // our pristine ping state if acquired. resetLastPing(); try { // Check the clock skew again. If we check this before we get a lock // and after the lock times out, we can be pretty sure the time is // increasing at the same rate on all servers and therefore our // timeout is accurate if (isRemoteTimeSkewed()) { string msg(str::stream() << "remote time in cluster " << _conn.toString() << " is now skewed, cannot force lock."); throw LockException(msg, ErrorCodes::DistributedClockSkewed); } // Make sure we break the lock with the correct "ts" (OID) value, otherwise // we can overwrite a new lock inserted in the meantime. conn->update(LocksType::ConfigNS, BSON(LocksType::name(_name) << LocksType::state() << o[LocksType::state()].numberInt() << LocksType::lockID(o[LocksType::lockID()].OID())), BSON("$set" << BSON(LocksType::state(LocksType::UNLOCKED)))); BSONObj err = conn->getLastErrorDetailed(); string errMsg = DBClientWithCommands::getLastErrorString(err); // TODO: Clean up all the extra code to exit this method, probably with a // refactor if (!errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1) { logErrMsgOrWarn( "Could not force lock", lockName, errMsg, "(another force won)"); *other = o; other->getOwned(); conn.done(); return false; } } catch (UpdateNotTheSame&) { // Ok to continue since we know we forced at least one lock document, and all // lock docs are required for a lock to be held. warning() << "lock forcing " << lockName << " inconsistent" << endl; } catch (const LockException&) { // Let the exception go up and don't repackage the exception. throw; } catch (std::exception& e) { conn.done(); string msg(str::stream() << "exception forcing distributed lock " << lockName << causedBy(e)); throw LockException(msg, 13660); } } else { // Not strictly necessary, but helpful for small timeouts where thread // scheduling is significant. This ensures that two attempts are still // required for a force if not acquired, and resets our state if we // are acquired. resetLastPing(); // Test that the lock is held by trying to update the finalized state of the lock to // the same state. If it does not update, or does not update on all servers, we can't // re-enter.
try { // Test the lock with the correct "ts" (OID) value conn->update(LocksType::ConfigNS, BSON(LocksType::name(_name) << LocksType::state(LocksType::LOCKED) << LocksType::lockID(o[LocksType::lockID()].OID())), BSON("$set" << BSON(LocksType::state(LocksType::LOCKED)))); BSONObj err = conn->getLastErrorDetailed(); string errMsg = DBClientWithCommands::getLastErrorString(err); // TODO: Clean up all the extra code to exit this method, probably with a // refactor if (!errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1) { logErrMsgOrWarn( "Could not re-enter lock", lockName, errMsg, "(not sure lock is held"); *other = o; other->getOwned(); conn.done(); return false; } } catch (UpdateNotTheSame&) { // NOT ok to continue since our lock isn't held by all servers, so isn't valid. warning() << "inconsistent state re-entering lock, lock " << lockName << " not held" << endl; *other = o; other->getOwned(); conn.done(); return false; } catch (std::exception& e) { conn.done(); string msg(str::stream() << "exception re-entering distributed lock " << lockName << causedBy(e)); throw LockException(msg, 13660); } LOG(logLvl - 1) << "re-entered distributed lock '" << lockName << "'" << endl; *other = o.getOwned(); conn.done(); return true; } LOG(logLvl - 1) << "lock '" << lockName << "' successfully forced" << endl; // We don't need the ts value in the query, since we will only ever replace locks with // state=0. } // Case 3: We have an expired lock else if (o[LocksType::lockID()].type()) { queryBuilder.append(o[LocksType::lockID()]); } } // Always reset our ping if we're trying to get a lock, since getting a lock implies the lock // state is open and no locks need to be forced. If anything goes wrong, we don't want to // remember an old lock. resetLastPing(); bool gotLock = false; BSONObj currLock; BSONObj lockDetails = BSON(LocksType::state(LocksType::LOCK_PREP) << LocksType::who(getDistLockId()) << LocksType::process(_processId) << LocksType::when(jsTime()) << LocksType::why(why) << LocksType::lockID(lockID)); BSONObj whatIWant = BSON("$set" << lockDetails); BSONObj query = queryBuilder.obj(); string lockName = _name + string("/") + _processId; try { // Main codepath to acquire lock LOG(logLvl) << "about to acquire distributed lock '" << lockName << "'"; LOG(logLvl + 1) << "trying to acquire lock " << query.toString(false, true) << " with details " << lockDetails.toString(false, true) << endl; conn->update(LocksType::ConfigNS, query, whatIWant); BSONObj err = conn->getLastErrorDetailed(); string errMsg = DBClientWithCommands::getLastErrorString(err); currLock = conn->findOne(LocksType::ConfigNS, BSON(LocksType::name(_name))); if (!errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1) { logErrMsgOrWarn("could not acquire lock", lockName, errMsg, "(another update won)"); *other = currLock; other->getOwned(); gotLock = false; } else { gotLock = true; } } catch (UpdateNotTheSame& up) { // this means our update got through on some, but not others warning() << "distributed lock '" << lockName << " did not propagate properly." << causedBy(up) << endl; // Overall protection derives from: // All unlocking updates use the ts value when setting state to 0 // This ensures that during locking, we can override all smaller ts locks with // our own safe ts value and not be unlocked afterward. 
for (unsigned i = 0; i < up.size(); i++) { ScopedDbConnection indDB(up[i].first); BSONObj indUpdate; try { indUpdate = indDB->findOne(LocksType::ConfigNS, BSON(LocksType::name(_name))); const auto currentLockID = indUpdate[LocksType::lockID()].OID(); // If we override this lock in any way, grab and protect it. // We assume/ensure that if a process does not have all lock documents, it is no // longer holding the lock. // Note - finalized locks may compete too, but we know they've won already if // competing in this round. Cleanup of crashes during finalizing may take a few // tries. if (currentLockID < lockID || indUpdate[LocksType::state()].numberInt() == LocksType::UNLOCKED) { BSONObj grabQuery = BSON(LocksType::name(_name) << LocksType::lockID(currentLockID)); // Change ts so we won't be forced, state so we won't be relocked BSONObj grabChanges = BSON(LocksType::lockID(lockID) << LocksType::state(LocksType::LOCK_PREP)); // Either our update will succeed, and we'll grab the lock, or it will fail b/c // some other process grabbed the lock (which will change the ts), but the lock // will be set until forcing indDB->update(LocksType::ConfigNS, grabQuery, BSON("$set" << grabChanges)); indUpdate = indDB->findOne(LocksType::ConfigNS, BSON(LocksType::name(_name))); // The tournament was interfered with, and it is not safe to proceed further. // One case where this could happen is when the LockPinger processes old // entries from addUnlockOID. See SERVER-10688 for a more detailed // description of the race. if (indUpdate[LocksType::state()].numberInt() <= LocksType::UNLOCKED) { LOG(logLvl - 1) << "lock tournament interrupted, " << "so no lock was taken; " << "new state of lock: " << indUpdate << endl; // We now break and set our currLock lockID value to zero, so that // we know that we did not acquire the lock below. Later code will // clean up failed entries. currLock = BSON(LocksType::lockID(OID())); indDB.done(); break; } } // else our lock is the same, in which case we're safe, or it's a bigger lock, // in which case we won't need to protect anything since we won't have the lock. } catch (std::exception& e) { conn.done(); string msg(str::stream() << "distributed lock " << lockName << " had errors communicating with individual server " << up[i].first << causedBy(e)); throw LockException(msg, 13661, lockID); } verify(!indUpdate.isEmpty()); // Find max TS value if (currLock.isEmpty() || currLock[LocksType::lockID()] < indUpdate[LocksType::lockID()]) { currLock = indUpdate.getOwned(); } indDB.done(); } // Locks on all servers are now set and safe until forcing if (currLock[LocksType::lockID()].OID() == lockID) { LOG(logLvl - 1) << "lock update won, completing lock propagation for '" << lockName << "'" << endl; gotLock = true; } else { LOG(logLvl - 1) << "lock update lost, lock '" << lockName << "' not propagated." << endl; gotLock = false; } } catch (std::exception& e) { conn.done(); string msg(str::stream() << "exception creating distributed lock " << lockName << causedBy(e)); throw LockException(msg, 13663, lockID); } // Complete lock propagation if (gotLock) { // This is now safe, since we know that no new locks will be placed on top of the ones we've // checked for at least 15 minutes. Sets the state = 2, so that future clients can // determine that the lock is truly set.
// The invariant for rollbacks is that we will never // force locks with state = 2 and active pings, since that indicates the lock is active, but // this means the process creating/destroying them must explicitly poll when something goes // wrong. try { BSONObjBuilder finalLockDetails; BSONObjIterator bi(lockDetails); while (bi.more()) { BSONElement el = bi.next(); if ((string)(el.fieldName()) == LocksType::state()) finalLockDetails.append(LocksType::state(), LocksType::LOCKED); else finalLockDetails.append(el); } conn->update(LocksType::ConfigNS, BSON(LocksType::name(_name)), BSON("$set" << finalLockDetails.obj())); BSONObj err = conn->getLastErrorDetailed(); string errMsg = DBClientWithCommands::getLastErrorString(err); currLock = conn->findOne(LocksType::ConfigNS, BSON(LocksType::name(_name))); if (!errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1) { warning() << "could not finalize winning lock " << lockName << (!errMsg.empty() ? causedBy(errMsg) : " (did not update lock) ") << endl; gotLock = false; } else { // SUCCESS! gotLock = true; } } catch (std::exception& e) { conn.done(); string msg(str::stream() << "exception finalizing winning lock" << causedBy(e)); // Inform caller about the potential orphan lock. throw LockException(msg, 13662, lockID); } } *other = currLock; other->getOwned(); // Log our lock results if (gotLock) LOG(logLvl - 1) << "distributed lock '" << lockName << "' acquired for '" << why << "', ts : " << currLock[LocksType::lockID()].OID(); else LOG(logLvl - 1) << "distributed lock '" << lockName << "' was not acquired."; conn.done(); return gotLock; }
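Per the contract stated at the top of lock_try() (false means "busy, may retry"; LockException means "stop"), a caller-side retry loop might look roughly like the sketch below. The acquireWithRetry helper, the one-second backoff, and the 30-second connection timeout are assumptions for illustration; only the lock_try() signature comes from the code above.

// Hedged sketch of a caller honoring lock_try()'s retry contract.
bool acquireWithRetry(DistributedLock& lock, int maxAttempts) {
    const OID lockSessionID = OID::gen();
    BSONObj otherLockDoc;  // filled in with the competing lock document on failure

    for (int attempt = 0; attempt < maxAttempts; ++attempt) {
        try {
            if (lock.lock_try(lockSessionID, "doing balance round", &otherLockDoc, 30.0)) {
                return true;  // lock document written and finalized (state == LOCKED)
            }
        } catch (const LockException& e) {
            // Unexpected failure (clock skew, inconsistent propagation, comms error):
            // per the semantics above, do not simply retry.
            warning() << "giving up on distributed lock" << causedBy(e);
            return false;
        }
        // Someone else holds the lock (or our update lost the race); safe to retry later.
        sleepsecs(1);
    }
    return false;
}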
int Balancer::_moveChunks(OperationContext* txn, const BalancerChunkSelectionPolicy::MigrateInfoVector& candidateChunks, const MigrationSecondaryThrottleOptions& secondaryThrottle, bool waitForDelete) { int movedCount = 0; for (const auto& migrateInfo : candidateChunks) { // If the balancer was disabled since we started this round, don't start new chunk // moves. if (!Grid::get(txn)->getBalancerConfiguration()->isBalancerActive() || MONGO_FAIL_POINT(skipBalanceRound)) { LOG(1) << "Stopping balancing round early as balancing was disabled"; return movedCount; } // Changes to metadata, borked metadata, and connectivity problems between shards // should cause us to abort this chunk move, but shouldn't cause us to abort the entire // round of chunks. // // TODO(spencer): We probably *should* abort the whole round on issues communicating // with the config servers, but it's impossible to distinguish those types of failures // at the moment. // // TODO: Handle all these things more cleanly, since they're expected problems const NamespaceString nss(migrateInfo.ns); try { auto scopedCM = uassertStatusOK(ScopedChunkManager::getExisting(txn, nss)); ChunkManager* const cm = scopedCM.cm(); shared_ptr<Chunk> c = cm->findIntersectingChunk(txn, migrateInfo.minKey); if (c->getMin().woCompare(migrateInfo.minKey) || c->getMax().woCompare(migrateInfo.maxKey)) { log() << "Migration " << migrateInfo << " will be skipped this round due to chunk metadata mismatch."; scopedCM.db()->getChunkManager(txn, nss.ns(), true); continue; } BSONObj res; if (c->moveAndCommit(txn, migrateInfo.to, Grid::get(txn)->getBalancerConfiguration()->getMaxChunkSizeBytes(), secondaryThrottle, waitForDelete, 0, /* maxTimeMS */ res)) { movedCount++; continue; } log() << "balancer move failed: " << res << ", migrate: " << migrateInfo; Status moveStatus = getStatusFromCommandResult(res); if (moveStatus == ErrorCodes::ChunkTooBig || res["chunkTooBig"].trueValue()) { log() << "Performing a split because migrate failed for size reasons"; auto splitStatus = c->split(txn, Chunk::normal, NULL); if (!splitStatus.isOK()) { log() << "marking chunk as jumbo: " << c->toString(); c->markAsJumbo(txn); // We increment movedCount so we do another round right away movedCount++; } } } catch (const DBException& ex) { log() << "balancer move " << migrateInfo << " failed" << causedBy(ex); } } return movedCount; }
bool ClientInfo::getLastError( const BSONObj& options , BSONObjBuilder& result , bool fromWriteBackListener ) { set<string> * shards = getPrev(); if ( shards->size() == 0 ) { result.appendNull( "err" ); return true; } vector<WBInfo> writebacks; // handle single server if ( shards->size() == 1 ) { string theShard = *(shards->begin() ); ShardConnection conn( theShard , "", true ); BSONObj res; bool ok = false; try{ ok = conn->runCommand( "admin" , options , res ); } catch( std::exception &e ){ warning() << "could not get last error." << causedBy( e ) << endl; // Catch everything that happens here, since we need to ensure we return our connection when we're // finished. conn.done(); return false; } res = res.getOwned(); conn.done(); _addWriteBack( writebacks , res ); // hit other machines just to block for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) { string temp = *i; if ( temp == theShard ) continue; ShardConnection conn( temp , "" ); _addWriteBack( writebacks , conn->getLastErrorDetailed() ); conn.done(); } clearSinceLastGetError(); if ( writebacks.size() ){ vector<BSONObj> v = _handleWriteBacks( writebacks , fromWriteBackListener ); if ( v.size() == 0 && fromWriteBackListener ) { // ok } else { assert( v.size() == 1 ); result.appendElements( v[0] ); result.appendElementsUnique( res ); result.append( "writebackGLE" , v[0] ); result.append( "initialGLEHost" , theShard ); } } else { result.append( "singleShard" , theShard ); result.appendElements( res ); } return ok; } BSONArrayBuilder bbb( result.subarrayStart( "shards" ) ); BSONObjBuilder shardRawGLE; long long n = 0; int updatedExistingStat = 0; // 0 is none, -1 has but false, 1 has true // hit each shard vector<string> errors; vector<BSONObj> errorObjects; for ( set<string>::iterator i = shards->begin(); i != shards->end(); i++ ) { string theShard = *i; bbb.append( theShard ); ShardConnection conn( theShard , "", true ); BSONObj res; bool ok = false; try { ok = conn->runCommand( "admin" , options , res ); shardRawGLE.append( theShard , res ); } catch( std::exception &e ){ // Safe to return here, since we haven't started any extra processing yet, just collecting // responses. warning() << "could not get last error." 
<< causedBy( e ) << endl; conn.done(); return false; } _addWriteBack( writebacks, res ); string temp = DBClientWithCommands::getLastErrorString( res ); if ( conn->type() != ConnectionString::SYNC && ( ok == false || temp.size() ) ) { errors.push_back( temp ); errorObjects.push_back( res ); } n += res["n"].numberLong(); if ( res["updatedExisting"].type() ) { if ( res["updatedExisting"].trueValue() ) updatedExistingStat = 1; else if ( updatedExistingStat == 0 ) updatedExistingStat = -1; } conn.done(); } bbb.done(); result.append( "shardRawGLE" , shardRawGLE.obj() ); result.appendNumber( "n" , n ); if ( updatedExistingStat ) result.appendBool( "updatedExisting" , updatedExistingStat > 0 ); // hit other machines just to block for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) { string temp = *i; if ( shards->count( temp ) ) continue; ShardConnection conn( temp , "" ); _addWriteBack( writebacks, conn->getLastErrorDetailed() ); conn.done(); } clearSinceLastGetError(); if ( errors.size() == 0 ) { result.appendNull( "err" ); _handleWriteBacks( writebacks , fromWriteBackListener ); return true; } result.append( "err" , errors[0].c_str() ); { // errs BSONArrayBuilder all( result.subarrayStart( "errs" ) ); for ( unsigned i=0; i<errors.size(); i++ ) { all.append( errors[i].c_str() ); } all.done(); } { // errObjects BSONArrayBuilder all( result.subarrayStart( "errObjects" ) ); for ( unsigned i=0; i<errorObjects.size(); i++ ) { all.append( errorObjects[i] ); } all.done(); } _handleWriteBacks( writebacks , fromWriteBackListener ); return true; }
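For readers tracing the multi-shard branch above, the result document it assembles has roughly the following shape. Every value here is invented for illustration and fromjson is used only for compactness; the field names, however, match what the method appends (shards, shardRawGLE, n, updatedExisting, err, errs, errObjects).

// Illustrative only: an aggregated getLastError result for a two-shard write where
// one shard reported a duplicate key error.
BSONObj exampleAggregatedGLE = fromjson(
    "{ shards: ['shard0000', 'shard0001'],"
    "  shardRawGLE: { shard0000: { err: null, n: 2, ok: 1 },"
    "                 shard0001: { err: 'E11000 duplicate key error', code: 11000, n: 0, ok: 1 } },"
    "  n: 2,"
    "  updatedExisting: true,"
    "  err: 'E11000 duplicate key error',"
    "  errs: ['E11000 duplicate key error'],"
    "  errObjects: [{ err: 'E11000 duplicate key error', code: 11000, n: 0, ok: 1 }] }");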
int Balancer::_moveChunks(const vector<CandidateChunkPtr>* candidateChunks, bool secondaryThrottle, bool waitForDelete) { int movedCount = 0; for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) { const CandidateChunk& chunkInfo = *it->get(); // Changes to metadata, borked metadata, and connectivity problems should cause us to // abort this chunk move, but shouldn't cause us to abort the entire round of chunks. // TODO: Handle all these things more cleanly, since they're expected problems try { DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns ); verify( cfg ); // NOTE: We purposely do not reload metadata here, since _doBalanceRound already // tried to do so once. ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns ); verify( cm ); ChunkPtr c = cm->findIntersectingChunk( chunkInfo.chunk.min ); if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) { // likely a split happened somewhere cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */); verify( cm ); c = cm->findIntersectingChunk( chunkInfo.chunk.min ); if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) { log() << "chunk mismatch after reload, ignoring will retry issue " << chunkInfo.chunk.toString() << endl; continue; } } BSONObj res; if (c->moveAndCommit(Shard::make(chunkInfo.to), Chunk::MaxChunkSize, secondaryThrottle, waitForDelete, res)) { movedCount++; continue; } // the move requires acquiring the collection metadata's lock, which can fail log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to << " chunk: " << chunkInfo.chunk << endl; if ( res["chunkTooBig"].trueValue() ) { // reload just to be safe cm = cfg->getChunkManager( chunkInfo.ns ); verify( cm ); c = cm->findIntersectingChunk( chunkInfo.chunk.min ); log() << "forcing a split because migrate failed for size reasons" << endl; res = BSONObj(); c->singleSplit( true , res ); log() << "forced split results: " << res << endl; if ( ! res["ok"].trueValue() ) { log() << "marking chunk as jumbo: " << c->toString() << endl; c->markAsJumbo(); // we increment moveCount so we do another round right away movedCount++; } } } catch( const DBException& ex ) { warning() << "could not move chunk " << chunkInfo.chunk.toString() << ", continuing balancing round" << causedBy( ex ) << endl; } } return movedCount; }