/**
 * Starts the clone: resolves the donor and recipient shards, prepares the currently available
 * documents via _storeCurrentLocs and sends the start-chunk-clone command to the recipient.
 *
 * Must be invoked with no locks held on 'txn'. Every early (error) return leaves the scope guard
 * armed, which calls cancelClone(txn); the guard is dismissed only when all steps succeeded.
 *
 * Returns the first non-OK status encountered, or Status::OK().
 */
Status MigrationChunkClonerSourceLegacy::startClone(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());

    auto cancelOnError = MakeGuard([&] { cancelClone(txn); });

    // Donor shard: keep its connection string, it is forwarded to the recipient further down
    {
        auto swDonorShard = grid.shardRegistry()->getShard(txn, _args.getFromShardId());
        if (!swDonorShard.isOK()) {
            return swDonorShard.getStatus();
        }

        _donorCS = swDonorShard.getValue()->getConnString();
    }

    // Recipient shard: pick the host that is currently primary
    {
        auto swRecipientShard = grid.shardRegistry()->getShard(txn, _args.getToShardId());
        if (!swRecipientShard.isOK()) {
            return swRecipientShard.getStatus();
        }

        auto swPrimaryHost = swRecipientShard.getValue()->getTargeter()->findHost(
            ReadPreferenceSetting{ReadPreference::PrimaryOnly});
        if (!swPrimaryHost.isOK()) {
            return swPrimaryHost.getStatus();
        }

        _recipientHost = std::move(swPrimaryHost.getValue());
    }

    // Prepare the documents which are available at this point
    Status storeLocsStatus = _storeCurrentLocs(txn);
    if (!storeLocsStatus.isOK()) {
        return storeLocsStatus;
    }

    // Ask the recipient shard to begin cloning
    BSONObjBuilder startCloneCmd;
    StartChunkCloneRequest::appendAsCommand(&startCloneCmd,
                                            _args.getNss(),
                                            _sessionId,
                                            _args.getConfigServerCS(),
                                            _donorCS,
                                            _args.getFromShardId(),
                                            _args.getToShardId(),
                                            _args.getMinKey(),
                                            _args.getMaxKey(),
                                            _shardKeyPattern.toBSON(),
                                            _args.getSecondaryThrottle());

    auto swStartCloneReply = _callRecipient(startCloneCmd.obj());
    if (!swStartCloneReply.isOK()) {
        return swStartCloneReply.getStatus();
    }

    // Everything succeeded, so the clone must not be cancelled on scope exit
    cancelOnError.Dismiss();
    return Status::OK();
}
/**
 * Sends the recvChunk commit request for this migration session to the recipient shard.
 *
 * Must be invoked with no locks held on 'txn', and the clone must not have been completed yet
 * (both are enforced with invariants). If the recipient call fails, the clone is cancelled and the
 * failing status is returned; otherwise _cleanup(txn) runs and Status::OK() is returned.
 */
Status MigrationChunkClonerSourceLegacy::commitClone(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());

    // _cloneCompleted is inspected under _mutex; the lock is released before the network call
    {
        stdx::lock_guard<stdx::mutex> lk(_mutex);
        invariant(!_cloneCompleted);
    }

    auto swCommitReply =
        _callRecipient(createRecvChunkCommitRequest(_args.getNss(), _sessionId));

    if (!swCommitReply.isOK()) {
        cancelClone(txn);
        return swCommitReply.getStatus();
    }

    _cleanup(txn);
    return Status::OK();
}
/**
 * Polls the recipient shard with the kRecvChunkStatus command until it reports the "steady" state,
 * an error is detected, the operation is interrupted, or 'maxTimeToWait' has elapsed.
 *
 * Must be invoked with no locks held on 'txn'.
 *
 * Returns:
 *  - Status::OK() once the recipient is "steady" and _cloneLocs is empty.
 *  - OperationIncomplete if the recipient is "steady" while _cloneLocs is non-empty, or if the
 *    reply describes a different migration.
 *  - OperationFailed if the recipient reports the "fail" state.
 *  - ExceededMemoryLimit if _memoryUsed is above the 500MB threshold.
 *  - The interrupt status of 'txn', or the (re-worded) error from contacting the recipient.
 *  - ExceededTimeLimit if the wait ran out of time.
 *
 * cancelClone(txn) is invoked through a scope guard on every return path except success and the
 * time-out, where the guard is explicitly dismissed.
 */
Status MigrationChunkClonerSourceLegacy::awaitUntilCriticalSectionIsAppropriate(
    OperationContext* txn, Milliseconds maxTimeToWait) {
    invariant(!txn->lockState()->isLocked());

    auto cancelOnError = MakeGuard([&] { cancelClone(txn); });

    // Above this many bytes of _memoryUsed the migration is given up
    const int kMaxMemoryUsedBytes = 500 * 1024 * 1024;

    const auto waitStart = Date_t::now();

    for (int iteration = 0; (Date_t::now() - waitStart) < maxTimeToWait; ++iteration) {
        // Back off exponentially, capped at 1024ms. The first sleeps are very short so that
        // migrations of empty chunks stay fast.
        sleepmillis(1 << std::min(iteration, 10));

        auto swProgress = _callRecipient(BSON(kRecvChunkStatus << _args.getNss().ns()));
        if (!swProgress.isOK()) {
            return {swProgress.getStatus().code(),
                    str::stream()
                        << "Failed to contact recipient shard to monitor data transfer due to "
                        << swProgress.getStatus().toString()};
        }

        BSONObj progress = std::move(swProgress.getValue());

        // NOTE(review): _memoryUsed and _cloneLocs are read in this function without taking
        // _mutex - confirm whether they can be modified concurrently.
        log() << "moveChunk data transfer progress: " << progress
              << " my mem used: " << _memoryUsed;

        const auto recipientState = progress["state"].String();

        if (recipientState == "steady") {
            // The recipient believes it has caught up; double check that nothing is left to be
            // cloned on this side before letting the caller proceed
            const std::size_t pendingLocs = _cloneLocs.size();
            if (pendingLocs == 0) {
                cancelOnError.Dismiss();
                return Status::OK();
            }

            return {
                ErrorCodes::OperationIncomplete,
                str::stream()
                    << "cannot enter critical section before all data is cloned, " << pendingLocs
                    << " locs were not transferred but to-shard thinks they are all cloned"};
        }

        if (recipientState == "fail") {
            return {ErrorCodes::OperationFailed, "Data transfer error"};
        }

        // The reply must refer to the namespace, donor and chunk bounds of this migration
        const bool replyIsForOtherMigration = progress["ns"].str() != _args.getNss().ns() ||
            progress["from"].str() != _donorCS.toString() || !progress["min"].isABSONObj() ||
            progress["min"].Obj().woCompare(_args.getMinKey()) != 0 ||
            !progress["max"].isABSONObj() ||
            progress["max"].Obj().woCompare(_args.getMaxKey()) != 0;

        if (replyIsForOtherMigration) {
            // The destination may have aborted this migration and accepted another recvChunk
            // before this thread observed the transition to the abort state. Currently that is
            // only possible when several migrations run at once, and is an unfortunate
            // consequence of the shards being unable to track multiple incoming and outgoing
            // migrations.
            return {ErrorCodes::OperationIncomplete,
                    "Destination shard aborted migration because a new one is running"};
        }

        if (_memoryUsed > kMaxMemoryUsedBytes) {
            // Too much memory is being consumed, so the migration is aborted
            return {ErrorCodes::ExceededMemoryLimit,
                    "Aborting migration because of high memory usage"};
        }

        Status interruptedStatus = txn->checkForInterruptNoAssert();
        if (!interruptedStatus.isOK()) {
            return interruptedStatus;
        }
    }

    // Running out of time does not cancel the clone: the guard is dismissed before returning
    cancelOnError.Dismiss();
    return {ErrorCodes::ExceededTimeLimit, "Timed out waiting for the cloner to catch up"};
}