Esempio n. 1
0
void ShardingState::donateChunk(OperationContext* txn,
                                const string& ns,
                                const BSONObj& min,
                                const BSONObj& max,
                                ChunkVersion version) {
    invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_X));
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    CollectionMetadataMap::const_iterator it = _collMetadata.find(ns);
    verify(it != _collMetadata.end());
    shared_ptr<CollectionMetadata> p = it->second;

    // empty shards should have version 0
    version = (p->getNumChunks() > 1) ? version : ChunkVersion(0, 0, p->getCollVersion().epoch());

    ChunkType chunk;
    chunk.setMin(min);
    chunk.setMax(max);
    string errMsg;

    shared_ptr<CollectionMetadata> cloned(p->cloneMigrate(chunk, version, &errMsg));
    // uassert to match old behavior, TODO: report errors w/o throwing
    uassert(16855, errMsg, NULL != cloned.get());

    // TODO: a bit dangerous to have two different zero-version states - no-metadata and
    // no-version
    _collMetadata[ns] = cloned;
}
Esempio n. 2
0
void MetadataManager::beginReceive(const ChunkRange& range) {
    stdx::lock_guard<stdx::mutex> scopedLock(_managerLock);

    // Collection is not known to be sharded if the active metadata tracker is null
    invariant(_activeMetadataTracker);

    // If range is contained within pending chunks, this means a previous migration must have failed
    // and we need to clean all overlaps
    RangeVector overlappedChunks;
    getRangeMapOverlap(_receivingChunks, range.getMin(), range.getMax(), &overlappedChunks);

    for (const auto& overlapChunkMin : overlappedChunks) {
        auto itRecv = _receivingChunks.find(overlapChunkMin.first);
        invariant(itRecv != _receivingChunks.end());

        const ChunkRange receivingRange(itRecv->first, itRecv->second);

        _receivingChunks.erase(itRecv);

        // Make sure any potentially partially copied chunks are scheduled to be cleaned up
        _addRangeToClean_inlock(receivingRange);
    }

    // Need to ensure that the background range deleter task won't delete the range we are about to
    // receive
    _removeRangeToClean_inlock(range);
    _receivingChunks.insert(std::make_pair(range.getMin().getOwned(), range.getMax().getOwned()));

    // For compatibility with the current range deleter, update the pending chunks on the collection
    // metadata to include the chunk being received
    ChunkType chunk;
    chunk.setMin(range.getMin());
    chunk.setMax(range.getMax());
    _setActiveMetadata_inlock(_activeMetadataTracker->metadata->clonePlusPending(chunk));
}
Esempio n. 3
0
    void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ChunkVersion version ) {
        scoped_lock lk( _mutex );

        CollectionMetadataMap::const_iterator it = _collMetadata.find( ns );
        verify( it != _collMetadata.end() ) ;
        CollectionMetadataPtr p = it->second;

        // empty shards should have version 0
        version =
                ( p->getNumChunks() > 1 ) ?
                        version : ChunkVersion( 0, 0, p->getCollVersion().epoch() );

        ChunkType chunk;
        chunk.setMin( min );
        chunk.setMax( max );
        string errMsg;

        CollectionMetadataPtr cloned( p->cloneMigrate( chunk, version, &errMsg ) );
        // uassert to match old behavior, TODO: report errors w/o throwing
        uassert( 16855, errMsg, NULL != cloned.get() );

        // TODO: a bit dangerous to have two different zero-version states - no-metadata and
        // no-version
        _collMetadata[ns] = cloned;
    }
Esempio n. 4
0
MigrateInfo MigrationType::toMigrateInfo() const {
    ChunkType chunk;
    chunk.setNS(_nss);
    chunk.setShard(_fromShard);
    chunk.setMin(_min);
    chunk.setMax(_max);
    chunk.setVersion(_chunkVersion);

    return MigrateInfo(_toShard, chunk);
}
Esempio n. 5
0
Status ChunkManager::createFirstChunks(OperationContext* txn,
                                       const ShardId& primaryShardId,
                                       const vector<BSONObj>* initPoints,
                                       const set<ShardId>* initShardIds) {
    // TODO distlock?
    // TODO: Race condition if we shard the collection and insert data while we split across
    // the non-primary shard.

    vector<BSONObj> splitPoints;
    vector<ShardId> shardIds;
    calcInitSplitsAndShards(txn, primaryShardId, initPoints, initShardIds, &splitPoints, &shardIds);

    // this is the first chunk; start the versioning from scratch
    ChunkVersion version(1, 0, OID::gen());

    log() << "going to create " << splitPoints.size() + 1 << " chunk(s) for: " << _ns
          << " using new epoch " << version.epoch();

    for (unsigned i = 0; i <= splitPoints.size(); i++) {
        BSONObj min = i == 0 ? _keyPattern.getKeyPattern().globalMin() : splitPoints[i - 1];
        BSONObj max =
            i < splitPoints.size() ? splitPoints[i] : _keyPattern.getKeyPattern().globalMax();

        ChunkType chunk;
        chunk.setName(Chunk::genID(_ns, min));
        chunk.setNS(_ns);
        chunk.setMin(min);
        chunk.setMax(max);
        chunk.setShard(shardIds[i % shardIds.size()]);
        chunk.setVersion(version);

        Status status = grid.catalogManager(txn)
                        ->insertConfigDocument(txn, ChunkType::ConfigNS, chunk.toBSON());
        if (!status.isOK()) {
            const string errMsg = str::stream()
                                  << "Creating first chunks failed: " << status.reason();
            error() << errMsg;
            return Status(status.code(), errMsg);
        }

        version.incMinor();
    }

    _version = ChunkVersion(0, 0, version.epoch());

    return Status::OK();
}
Esempio n. 6
0
bool ShardingState::notePending(OperationContext* txn,
                                const string& ns,
                                const BSONObj& min,
                                const BSONObj& max,
                                const OID& epoch,
                                string* errMsg) {
    invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_X));
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    CollectionMetadataMap::const_iterator it = _collMetadata.find(ns);
    if (it == _collMetadata.end()) {
        *errMsg = str::stream() << "could not note chunk "
                                << "[" << min << "," << max << ")"
                                << " as pending because the local metadata for " << ns
                                << " has changed";

        return false;
    }

    shared_ptr<CollectionMetadata> metadata = it->second;

    // This can currently happen because drops aren't synchronized with in-migrations
    // The idea for checking this here is that in the future we shouldn't have this problem
    if (metadata->getCollVersion().epoch() != epoch) {
        *errMsg = str::stream() << "could not note chunk "
                                << "[" << min << "," << max << ")"
                                << " as pending because the epoch for " << ns
                                << " has changed from " << epoch << " to "
                                << metadata->getCollVersion().epoch();

        return false;
    }

    ChunkType chunk;
    chunk.setMin(min);
    chunk.setMax(max);

    shared_ptr<CollectionMetadata> cloned(metadata->clonePlusPending(chunk, errMsg));
    if (!cloned)
        return false;

    _collMetadata[ns] = cloned;
    return true;
}
Esempio n. 7
0
    bool ShardingState::forgetPending( const string& ns,
                                       const BSONObj& min,
                                       const BSONObj& max,
                                       const OID& epoch,
                                       string* errMsg ) {
        scoped_lock lk( _mutex );

        CollectionMetadataMap::const_iterator it = _collMetadata.find( ns );
        if ( it == _collMetadata.end() ) {

            *errMsg = str::stream() << "no need to forget pending chunk "
                                    << "[" << min << "," << max << ")"
                                    << " because the local metadata for " << ns << " has changed";

            return false;
        }

        CollectionMetadataPtr metadata = it->second;

        // This can currently happen because drops aren't synchronized with in-migrations
        // The idea for checking this here is that in the future we shouldn't have this problem
        if ( metadata->getCollVersion().epoch() != epoch ) {

            *errMsg = str::stream() << "no need to forget pending chunk "
                                    << "[" << min << "," << max << ")"
                                    << " because the epoch for " << ns << " has changed from "
                                    << epoch << " to " << metadata->getCollVersion().epoch();

            return false;
        }

        ChunkType chunk;
        chunk.setMin( min );
        chunk.setMax( max );

        CollectionMetadataPtr cloned( metadata->cloneMinusPending( chunk, errMsg ) );
        if ( !cloned ) return false;

        _collMetadata[ns] = cloned;
        return true;
    }
Esempio n. 8
0
    void ShardingState::splitChunk( const string& ns,
                                    const BSONObj& min,
                                    const BSONObj& max,
                                    const vector<BSONObj>& splitKeys,
                                    ChunkVersion version )
    {
        scoped_lock lk( _mutex );

        CollectionMetadataMap::const_iterator it = _collMetadata.find( ns );
        verify( it != _collMetadata.end() ) ;

        ChunkType chunk;
        chunk.setMin( min );
        chunk.setMax( max );
        string errMsg;

        CollectionMetadataPtr cloned( it->second->cloneSplit( chunk, splitKeys, version, &errMsg ) );
        // uassert to match old behavior, TODO: report errors w/o throwing
        uassert( 16857, errMsg, NULL != cloned.get() );

        _collMetadata[ns] = cloned;
    }
Esempio n. 9
0
        /**
         * Stores ranges for a particular collection and shard starting from some version
         */
        void storeCollectionRanges( const NamespaceString& nss,
                                    const string& shardName,
                                    const vector<KeyRange>& ranges,
                                    const ChunkVersion& startVersion ) {

            // Get key pattern from first range
            ASSERT_GREATER_THAN( ranges.size(), 0u );

            CollectionType coll;
            coll.setNS( nss.ns() );
            coll.setKeyPattern( ranges.begin()->keyPattern );
            coll.setEpoch( startVersion.epoch() );
            coll.setUpdatedAt( 1ULL );
            string errMsg;
            ASSERT( coll.isValid( &errMsg ) );

            DBDirectClient client(&_txn);

            client.update( CollectionType::ConfigNS,
                           BSON( CollectionType::ns( coll.getNS() ) ),
                           coll.toBSON(), true, false );

            ChunkVersion nextVersion = startVersion;
            for ( vector<KeyRange>::const_iterator it = ranges.begin(); it != ranges.end(); ++it ) {

                ChunkType chunk;
                // TODO: We should not rely on the serialized ns, minkey being unique in the future,
                // causes problems since it links string serialization to correctness.
                chunk.setName( Chunk::genID( nss, it->minKey ) );
                chunk.setShard( shardName );
                chunk.setNS( nss.ns() );
                chunk.setVersion( nextVersion );
                chunk.setMin( it->minKey );
                chunk.setMax( it->maxKey );
                nextVersion.incMajor();

                client.insert( ChunkType::ConfigNS, chunk.toBSON() );
            }
        }
Esempio n. 10
0
void MetadataManager::forgetReceive(const ChunkRange& range) {
    stdx::lock_guard<stdx::mutex> scopedLock(_managerLock);

    {
        auto it = _receivingChunks.find(range.getMin());
        invariant(it != _receivingChunks.end());

        // Verify entire ChunkRange is identical, not just the min key.
        invariant(it->second == range.getMax());

        _receivingChunks.erase(it);
    }

    // This is potentially a partially received data, which needs to be cleaned up
    _addRangeToClean_inlock(range);

    // For compatibility with the current range deleter, update the pending chunks on the collection
    // metadata to exclude the chunk being received, which was added in beginReceive
    ChunkType chunk;
    chunk.setMin(range.getMin());
    chunk.setMax(range.getMax());
    _setActiveMetadata_inlock(_activeMetadataTracker->metadata->cloneMinusPending(chunk));
}
Esempio n. 11
0
void ShardingState::splitChunk(OperationContext* txn,
                               const string& ns,
                               const BSONObj& min,
                               const BSONObj& max,
                               const vector<BSONObj>& splitKeys,
                               ChunkVersion version) {
    invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_X));
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    CollectionMetadataMap::const_iterator it = _collMetadata.find(ns);
    verify(it != _collMetadata.end());

    ChunkType chunk;
    chunk.setMin(min);
    chunk.setMax(max);
    string errMsg;

    shared_ptr<CollectionMetadata> cloned(
        it->second->cloneSplit(chunk, splitKeys, version, &errMsg));
    // uassert to match old behavior, TODO: report errors w/o throwing
    uassert(16857, errMsg, NULL != cloned.get());

    _collMetadata[ns] = cloned;
}
Esempio n. 12
0
void DistributionStatus::populateShardToChunksMap(const ShardStatisticsVector& allShards,
                                                  const ChunkManager& chunkMgr,
                                                  ShardToChunksMap* shardToChunksMap) {
    // Makes sure there is an entry in shardToChunksMap for every shard.
    for (const auto& stat : allShards) {
        (*shardToChunksMap)[stat.shardId];
    }

    const ChunkMap& chunkMap = chunkMgr.getChunkMap();
    for (ChunkMap::const_iterator it = chunkMap.begin(); it != chunkMap.end(); ++it) {
        const ChunkPtr chunkPtr = it->second;

        ChunkType chunk;
        chunk.setNS(chunkMgr.getns());
        chunk.setMin(chunkPtr->getMin().getOwned());
        chunk.setMax(chunkPtr->getMax().getOwned());
        chunk.setJumbo(chunkPtr->isJumbo());  // TODO: is this reliable?

        const string shardName(chunkPtr->getShardId());
        chunk.setShard(shardName);

        (*shardToChunksMap)[shardName].push_back(chunk);
    }
}
Esempio n. 13
0
bool Chunk::splitIfShould(long dataWritten) const {
    dassert(ShouldAutoSplit);
    LastError::Disabled d(&LastError::get(cc()));

    try {
        _dataWritten += dataWritten;
        int splitThreshold = getManager()->getCurrentDesiredChunkSize();
        if (_minIsInf() || _maxIsInf()) {
            splitThreshold = (int)((double)splitThreshold * .9);
        }

        if (_dataWritten < splitThreshold / ChunkManager::SplitHeuristics::splitTestFactor)
            return false;

        if (!getManager()->_splitHeuristics._splitTickets.tryAcquire()) {
            LOG(1) << "won't auto split because not enough tickets: " << getManager()->getns();
            return false;
        }

        TicketHolderReleaser releaser(&(getManager()->_splitHeuristics._splitTickets));

        // this is a bit ugly
        // we need it so that mongos blocks for the writes to actually be committed
        // this does mean mongos has more back pressure than mongod alone
        // since it nots 100% tcp queue bound
        // this was implicit before since we did a splitVector on the same socket
        ShardConnection::sync();

        LOG(1) << "about to initiate autosplit: " << *this << " dataWritten: " << _dataWritten
               << " splitThreshold: " << splitThreshold;

        BSONObj res;
        size_t splitCount = 0;
        Status status = split(Chunk::autoSplitInternal, &splitCount, &res);
        if (!status.isOK()) {
            // Split would have issued a message if we got here. This means there wasn't enough
            // data to split, so don't want to try again until considerable more data
            _dataWritten = 0;
            return false;
        }

        if (_maxIsInf() || _minIsInf()) {
            // we don't want to reset _dataWritten since we kind of want to check the other side
            // right away
        } else {
            // we're splitting, so should wait a bit
            _dataWritten = 0;
        }

        bool shouldBalance = grid.getConfigShouldBalance();
        if (shouldBalance) {
            auto status = grid.catalogManager()->getCollection(_manager->getns());
            if (!status.isOK()) {
                log() << "Auto-split for " << _manager->getns()
                      << " failed to load collection metadata due to " << status.getStatus();
                return false;
            }

            shouldBalance = status.getValue().getAllowBalance();
        }

        log() << "autosplitted " << _manager->getns() << " shard: " << toString() << " into "
              << (splitCount + 1) << " (splitThreshold " << splitThreshold << ")"
              << (res["shouldMigrate"].eoo() ? "" : (string) " (migrate suggested" +
                          (shouldBalance ? ")" : ", but no migrations allowed)"));

        // Top chunk optimization - try to move the top chunk out of this shard
        // to prevent the hot spot from staying on a single shard. This is based on
        // the assumption that succeeding inserts will fall on the top chunk.
        BSONElement shouldMigrate = res["shouldMigrate"];  // not in mongod < 1.9.1 but that is ok
        if (!shouldMigrate.eoo() && shouldBalance) {
            BSONObj range = shouldMigrate.embeddedObject();

            ChunkType chunkToMove;
            {
                const auto shard = grid.shardRegistry()->getShard(getShardId());
                chunkToMove.setShard(shard->toString());
            }
            chunkToMove.setMin(range["min"].embeddedObject());
            chunkToMove.setMax(range["max"].embeddedObject());

            tryMoveToOtherShard(*_manager, chunkToMove);
        }

        return true;

    } catch (DBException& e) {
        // TODO: Make this better - there are lots of reasons a split could fail
        // Random so that we don't sync up with other failed splits
        _dataWritten = mkDataWritten();

        // if the collection lock is taken (e.g. we're migrating), it is fine for the split to fail.
        warning() << "could not autosplit collection " << _manager->getns() << causedBy(e);
        return false;
    }
}
Esempio n. 14
0
    bool mergeChunks( OperationContext* txn,
                      const NamespaceString& nss,
                      const BSONObj& minKey,
                      const BSONObj& maxKey,
                      const OID& epoch,
                      string* errMsg ) {

        //
        // Get sharding state up-to-date
        //

        ConnectionString configLoc = ConnectionString::parse( shardingState.getConfigServer(),
                                                              *errMsg );
        if ( !configLoc.isValid() ){
            warning() << *errMsg << endl;
            return false;
        }

        //
        // Get the distributed lock
        //

        ScopedDistributedLock collLock( configLoc, nss.ns() );
        collLock.setLockMessage( stream() << "merging chunks in " << nss.ns() << " from "
                                          << minKey << " to " << maxKey );

        Status acquisitionStatus = collLock.tryAcquire();
        if (!acquisitionStatus.isOK()) {
            *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                               << " to merge chunks in [" << minKey << "," << maxKey << ")"
                               << causedBy(acquisitionStatus);

            warning() << *errMsg << endl;
            return false;
        }

        //
        // We now have the collection lock, refresh metadata to latest version and sanity check
        //

        ChunkVersion shardVersion;
        Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion);

        if ( !status.isOK() ) {

            *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                    << nss.ns() << causedBy( status.reason() );

            warning() << *errMsg << endl;
            return false;
        }

        if ( epoch.isSet() && shardVersion.epoch() != epoch ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has changed" << " since merge was sent" << "(sent epoch : "
                               << epoch.toString()
                               << ", current epoch : " << shardVersion.epoch().toString() << ")";

            warning() << *errMsg << endl;
            return false;
        }

        CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( nss.ns() );

        if ( !metadata || metadata->getKeyPattern().isEmpty() ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " is not sharded";

            warning() << *errMsg << endl;
            return false;
        }

        dassert( metadata->getShardVersion().equals( shardVersion ) );

        if ( !metadata->isValidKey( minKey ) || !metadata->isValidKey( maxKey ) ) {

            *errMsg = stream() << "could not merge chunks, the range "
                               << rangeToString( minKey, maxKey ) << " is not valid"
                               << " for collection " << nss.ns() << " with key pattern "
                               << metadata->getKeyPattern();

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Get merged chunk information
        //

        ChunkVersion mergeVersion = metadata->getCollVersion();
        mergeVersion.incMinor();

        OwnedPointerVector<ChunkType> chunksToMerge;

        ChunkType itChunk;
        itChunk.setMin( minKey );
        itChunk.setMax( minKey );
        itChunk.setNS( nss.ns() );
        itChunk.setShard( shardingState.getShardName() );

        while ( itChunk.getMax().woCompare( maxKey ) < 0 &&
                metadata->getNextChunk( itChunk.getMax(), &itChunk ) ) {
            auto_ptr<ChunkType> saved( new ChunkType );
            itChunk.cloneTo( saved.get() );
            chunksToMerge.mutableVector().push_back( saved.release() );
        }

        if ( chunksToMerge.empty() ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range starting at " << minKey
                               << " and ending at " << maxKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Validate the range starts and ends at chunks and has no holes, error if not valid
        //

        BSONObj firstDocMin = ( *chunksToMerge.begin() )->getMin();
        BSONObj firstDocMax = ( *chunksToMerge.begin() )->getMax();
        // minKey is inclusive
        bool minKeyInRange = rangeContains( firstDocMin, firstDocMax, minKey );

        if ( !minKeyInRange ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range starting at " << minKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        BSONObj lastDocMin = ( *chunksToMerge.rbegin() )->getMin();
        BSONObj lastDocMax = ( *chunksToMerge.rbegin() )->getMax();
        // maxKey is exclusive
        bool maxKeyInRange = lastDocMin.woCompare( maxKey ) < 0 &&
                lastDocMax.woCompare( maxKey ) >= 0;

        if ( !maxKeyInRange ) {
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range ending at " << maxKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        bool validRangeStartKey = firstDocMin.woCompare( minKey ) == 0;
        bool validRangeEndKey = lastDocMax.woCompare( maxKey ) == 0;

        if ( !validRangeStartKey || !validRangeEndKey ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " does not contain a chunk "
                               << ( !validRangeStartKey ? "starting at " + minKey.toString() : "" )
                               << ( !validRangeStartKey && !validRangeEndKey ? " or " : "" )
                               << ( !validRangeEndKey ? "ending at " + maxKey.toString() : "" );

            warning() << *errMsg << endl;
            return false;
        }

        if ( chunksToMerge.size() == 1 ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " already contains chunk for " << rangeToString( minKey, maxKey );

            warning() << *errMsg << endl;
            return false;
        }

        bool holeInRange = false;

        // Look for hole in range
        ChunkType* prevChunk = *chunksToMerge.begin();
        ChunkType* nextChunk = NULL;
        for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
                it != chunksToMerge.end(); ++it ) {
            if ( it == chunksToMerge.begin() ) continue;

            nextChunk = *it;
            if ( prevChunk->getMax().woCompare( nextChunk->getMin() ) != 0 ) {
                holeInRange = true;
                break;
            }
            prevChunk = nextChunk;
        }

        if ( holeInRange ) {

            dassert( NULL != nextChunk );
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has a hole in the range " << rangeToString( minKey, maxKey )
                               << " at " << rangeToString( prevChunk->getMax(),
                                                           nextChunk->getMin() );

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Run apply ops command
        //

        BSONObj applyOpsCmd = buildApplyOpsCmd( chunksToMerge,
                                                shardVersion,
                                                mergeVersion );

        bool ok;
        BSONObj result;
        try {
            ScopedDbConnection conn( configLoc, 30.0 );
            ok = conn->runCommand( "config", applyOpsCmd, result );
            if ( !ok ) *errMsg = result.toString();
            conn.done();
        }
        catch( const DBException& ex ) {
            ok = false;
            *errMsg = ex.toString();
        }

        if ( !ok ) {
            *errMsg = stream() << "could not merge chunks for " << nss.ns()
                               << ", writing to config failed" << causedBy( errMsg );

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Install merged chunk metadata
        //

        {
            Lock::DBLock writeLk(txn->lockState(), nss.db(), newlm::MODE_X);
            shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
        }

        //
        // Log change
        //

        BSONObj mergeLogEntry = buildMergeLogEntry( chunksToMerge,
                                                    shardVersion,
                                                    mergeVersion );

        configServer.logChange( "merge", nss.ns(), mergeLogEntry );

        return true;
    }
Esempio n. 15
0
bool mergeChunks(OperationContext* txn,
                 const NamespaceString& nss,
                 const BSONObj& minKey,
                 const BSONObj& maxKey,
                 const OID& epoch,
                 string* errMsg) {
    // Get the distributed lock
    string whyMessage = stream() << "merging chunks in " << nss.ns() << " from " << minKey << " to "
                                 << maxKey;
    auto scopedDistLock = grid.catalogManager(txn)->distLock(
        txn, nss.ns(), whyMessage, DistLockManager::kSingleLockAttemptTimeout);

    if (!scopedDistLock.isOK()) {
        *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                           << " to merge chunks in [" << minKey << "," << maxKey << ")"
                           << causedBy(scopedDistLock.getStatus());

        warning() << *errMsg;
        return false;
    }

    ShardingState* shardingState = ShardingState::get(txn);

    //
    // We now have the collection lock, refresh metadata to latest version and sanity check
    //

    ChunkVersion shardVersion;
    Status status = shardingState->refreshMetadataNow(txn, nss.ns(), &shardVersion);

    if (!status.isOK()) {
        *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                << nss.ns() << causedBy(status.reason());

        warning() << *errMsg;
        return false;
    }

    if (epoch.isSet() && shardVersion.epoch() != epoch) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has changed"
                           << " since merge was sent"
                           << "(sent epoch : " << epoch.toString()
                           << ", current epoch : " << shardVersion.epoch().toString() << ")";

        warning() << *errMsg;
        return false;
    }

    shared_ptr<CollectionMetadata> metadata = shardingState->getCollectionMetadata(nss.ns());

    if (!metadata || metadata->getKeyPattern().isEmpty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " is not sharded";

        warning() << *errMsg;
        return false;
    }

    dassert(metadata->getShardVersion().equals(shardVersion));

    if (!metadata->isValidKey(minKey) || !metadata->isValidKey(maxKey)) {
        *errMsg = stream() << "could not merge chunks, the range " << rangeToString(minKey, maxKey)
                           << " is not valid"
                           << " for collection " << nss.ns() << " with key pattern "
                           << metadata->getKeyPattern();

        warning() << *errMsg;
        return false;
    }

    //
    // Get merged chunk information
    //

    ChunkVersion mergeVersion = metadata->getCollVersion();
    mergeVersion.incMinor();

    std::vector<ChunkType> chunksToMerge;

    ChunkType itChunk;
    itChunk.setMin(minKey);
    itChunk.setMax(minKey);
    itChunk.setNS(nss.ns());
    itChunk.setShard(shardingState->getShardName());

    while (itChunk.getMax().woCompare(maxKey) < 0 &&
           metadata->getNextChunk(itChunk.getMax(), &itChunk)) {
        chunksToMerge.push_back(itChunk);
    }

    if (chunksToMerge.empty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " and ending at " << maxKey
                           << " does not belong to shard " << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    //
    // Validate the range starts and ends at chunks and has no holes, error if not valid
    //

    BSONObj firstDocMin = chunksToMerge.front().getMin();
    BSONObj firstDocMax = chunksToMerge.front().getMax();
    // minKey is inclusive
    bool minKeyInRange = rangeContains(firstDocMin, firstDocMax, minKey);

    if (!minKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " does not belong to shard "
                           << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    BSONObj lastDocMin = chunksToMerge.back().getMin();
    BSONObj lastDocMax = chunksToMerge.back().getMax();
    // maxKey is exclusive
    bool maxKeyInRange = lastDocMin.woCompare(maxKey) < 0 && lastDocMax.woCompare(maxKey) >= 0;

    if (!maxKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range ending at " << maxKey << " does not belong to shard "
                           << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    bool validRangeStartKey = firstDocMin.woCompare(minKey) == 0;
    bool validRangeEndKey = lastDocMax.woCompare(maxKey) == 0;

    if (!validRangeStartKey || !validRangeEndKey) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " does not contain a chunk "
                           << (!validRangeStartKey ? "starting at " + minKey.toString() : "")
                           << (!validRangeStartKey && !validRangeEndKey ? " or " : "")
                           << (!validRangeEndKey ? "ending at " + maxKey.toString() : "");

        warning() << *errMsg;
        return false;
    }

    if (chunksToMerge.size() == 1) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " already contains chunk for " << rangeToString(minKey, maxKey);

        warning() << *errMsg;
        return false;
    }

    // Look for hole in range
    for (size_t i = 1; i < chunksToMerge.size(); ++i) {
        if (chunksToMerge[i - 1].getMax().woCompare(chunksToMerge[i].getMin()) != 0) {
            *errMsg =
                stream() << "could not merge chunks, collection " << nss.ns()
                         << " has a hole in the range " << rangeToString(minKey, maxKey) << " at "
                         << rangeToString(chunksToMerge[i - 1].getMax(), chunksToMerge[i].getMin());

            warning() << *errMsg;
            return false;
        }
    }

    //
    // Run apply ops command
    //
    Status applyOpsStatus = runApplyOpsCmd(txn, chunksToMerge, shardVersion, mergeVersion);
    if (!applyOpsStatus.isOK()) {
        warning() << applyOpsStatus;
        return false;
    }

    //
    // Install merged chunk metadata
    //

    {
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock writeLk(txn->lockState(), nss.db(), MODE_IX);
        Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_X);

        shardingState->mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
    }

    //
    // Log change
    //

    BSONObj mergeLogEntry = buildMergeLogEntry(chunksToMerge, shardVersion, mergeVersion);

    grid.catalogManager(txn)->logChange(txn, "merge", nss.ns(), mergeLogEntry);

    return true;
}
Esempio n. 16
0
void updateChunkWriteStatsAndSplitIfNeeded(OperationContext* opCtx,
                                           ChunkManager* manager,
                                           Chunk* chunk,
                                           long dataWritten) {
    // Disable lastError tracking so that any errors, which occur during auto-split do not get
    // bubbled up on the client connection doing a write.
    LastError::Disabled d(&LastError::get(cc()));

    const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration();

    const bool minIsInf =
        (0 == manager->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk->getMin()));
    const bool maxIsInf =
        (0 == manager->getShardKeyPattern().getKeyPattern().globalMax().woCompare(chunk->getMax()));

    const uint64_t chunkBytesWritten = chunk->addBytesWritten(dataWritten);

    const uint64_t desiredChunkSize =
        calculateDesiredChunkSize(balancerConfig->getMaxChunkSizeBytes(), manager->numChunks());

    if (!chunk->shouldSplit(desiredChunkSize, minIsInf, maxIsInf)) {
        return;
    }

    const NamespaceString nss(manager->getns());

    if (!manager->_autoSplitThrottle._splitTickets.tryAcquire()) {
        LOG(1) << "won't auto split because not enough tickets: " << nss;
        return;
    }

    TicketHolderReleaser releaser(&(manager->_autoSplitThrottle._splitTickets));

    const ChunkRange chunkRange(chunk->getMin(), chunk->getMax());

    try {
        // Ensure we have the most up-to-date balancer configuration
        uassertStatusOK(balancerConfig->refreshAndCheck(opCtx));

        if (!balancerConfig->getShouldAutoSplit()) {
            return;
        }

        LOG(1) << "about to initiate autosplit: " << redact(chunk->toString())
               << " dataWritten: " << chunkBytesWritten
               << " desiredChunkSize: " << desiredChunkSize;

        const uint64_t chunkSizeToUse = [&]() {
            const uint64_t estNumSplitPoints = chunkBytesWritten / desiredChunkSize * 2;

            if (estNumSplitPoints >= kTooManySplitPoints) {
                // The current desired chunk size will split the chunk into lots of small chunk and
                // at the worst case this can result into thousands of chunks. So check and see if a
                // bigger value can be used.
                return std::min(chunkBytesWritten, balancerConfig->getMaxChunkSizeBytes());
            } else {
                return desiredChunkSize;
            }
        }();

        auto splitPoints =
            uassertStatusOK(shardutil::selectChunkSplitPoints(opCtx,
                                                              chunk->getShardId(),
                                                              nss,
                                                              manager->getShardKeyPattern(),
                                                              chunkRange,
                                                              chunkSizeToUse,
                                                              boost::none));

        if (splitPoints.size() <= 1) {
            // No split points means there isn't enough data to split on; 1 split point means we
            // have
            // between half the chunk size to full chunk size so there is no need to split yet
            chunk->clearBytesWritten();
            return;
        }

        if (minIsInf || maxIsInf) {
            // We don't want to reset _dataWritten since we want to check the other side right away
        } else {
            // We're splitting, so should wait a bit
            chunk->clearBytesWritten();
        }

        // We assume that if the chunk being split is the first (or last) one on the collection,
        // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use the
        // very first (or last) key as a split point.
        //
        // This heuristic is skipped for "special" shard key patterns that are not likely to produce
        // monotonically increasing or decreasing values (e.g. hashed shard keys).
        if (KeyPattern::isOrderedKeyPattern(manager->getShardKeyPattern().toBSON())) {
            if (minIsInf) {
                BSONObj key = findExtremeKeyForShard(
                    opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), true);
                if (!key.isEmpty()) {
                    splitPoints.front() = key.getOwned();
                }
            } else if (maxIsInf) {
                BSONObj key = findExtremeKeyForShard(
                    opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), false);
                if (!key.isEmpty()) {
                    splitPoints.back() = key.getOwned();
                }
            }
        }

        const auto suggestedMigrateChunk =
            uassertStatusOK(shardutil::splitChunkAtMultiplePoints(opCtx,
                                                                  chunk->getShardId(),
                                                                  nss,
                                                                  manager->getShardKeyPattern(),
                                                                  manager->getVersion(),
                                                                  chunkRange,
                                                                  splitPoints));

        // Balance the resulting chunks if the option is enabled and if the shard suggested a chunk
        // to balance
        const bool shouldBalance = [&]() {
            if (!balancerConfig->shouldBalanceForAutoSplit())
                return false;

            auto collStatus =
                Grid::get(opCtx)->catalogClient()->getCollection(opCtx, manager->getns());
            if (!collStatus.isOK()) {
                log() << "Auto-split for " << nss << " failed to load collection metadata"
                      << causedBy(redact(collStatus.getStatus()));
                return false;
            }

            return collStatus.getValue().value.getAllowBalance();
        }();

        log() << "autosplitted " << nss << " chunk: " << redact(chunk->toString()) << " into "
              << (splitPoints.size() + 1) << " parts (desiredChunkSize " << desiredChunkSize << ")"
              << (suggestedMigrateChunk ? "" : (std::string) " (migrate suggested" +
                          (shouldBalance ? ")" : ", but no migrations allowed)"));

        // Reload the chunk manager after the split
        auto routingInfo = uassertStatusOK(
            Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx,
                                                                                         nss));

        if (!shouldBalance || !suggestedMigrateChunk) {
            return;
        }

        // Top chunk optimization - try to move the top chunk out of this shard to prevent the hot
        // spot from staying on a single shard. This is based on the assumption that succeeding
        // inserts will fall on the top chunk.

        // We need to use the latest chunk manager (after the split) in order to have the most
        // up-to-date view of the chunk we are about to move
        auto suggestedChunk = routingInfo.cm()->findIntersectingChunkWithSimpleCollation(
            suggestedMigrateChunk->getMin());

        ChunkType chunkToMove;
        chunkToMove.setNS(nss.ns());
        chunkToMove.setShard(suggestedChunk->getShardId());
        chunkToMove.setMin(suggestedChunk->getMin());
        chunkToMove.setMax(suggestedChunk->getMax());
        chunkToMove.setVersion(suggestedChunk->getLastmod());

        uassertStatusOK(configsvr_client::rebalanceChunk(opCtx, chunkToMove));

        // Ensure the collection gets reloaded because of the move
        Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss);
    } catch (const DBException& ex) {
        chunk->clearBytesWritten();

        if (ErrorCodes::isStaleShardingError(ErrorCodes::Error(ex.getCode()))) {
            log() << "Unable to auto-split chunk " << redact(chunkRange.toString()) << causedBy(ex)
                  << ", going to invalidate routing table entry for " << nss;
            Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss);
        }
    }
}
Esempio n. 17
0
bool Chunk::splitIfShould(OperationContext* txn, long dataWritten) {
    LastError::Disabled d(&LastError::get(cc()));

    try {
        _dataWritten += dataWritten;
        uint64_t splitThreshold = _manager->getCurrentDesiredChunkSize();
        if (_minIsInf() || _maxIsInf()) {
            splitThreshold = static_cast<uint64_t>((double)splitThreshold * 0.9);
        }

        if (_dataWritten < splitThreshold / ChunkManager::SplitHeuristics::splitTestFactor) {
            return false;
        }

        if (!_manager->_splitHeuristics._splitTickets.tryAcquire()) {
            LOG(1) << "won't auto split because not enough tickets: " << _manager->getns();
            return false;
        }

        TicketHolderReleaser releaser(&(_manager->_splitHeuristics._splitTickets));

        const auto balancerConfig = Grid::get(txn)->getBalancerConfiguration();

        Status refreshStatus = balancerConfig->refreshAndCheck(txn);
        if (!refreshStatus.isOK()) {
            warning() << "Unable to refresh balancer settings" << causedBy(refreshStatus);
            return false;
        }

        bool shouldAutoSplit = balancerConfig->getShouldAutoSplit();
        if (!shouldAutoSplit) {
            return false;
        }

        LOG(1) << "about to initiate autosplit: " << *this << " dataWritten: " << _dataWritten
               << " splitThreshold: " << splitThreshold;

        size_t splitCount = 0;
        auto splitStatus = split(txn, Chunk::autoSplitInternal, &splitCount);
        if (!splitStatus.isOK()) {
            // Split would have issued a message if we got here. This means there wasn't enough
            // data to split, so don't want to try again until considerable more data
            _dataWritten = 0;
            return false;
        }

        if (_maxIsInf() || _minIsInf()) {
            // we don't want to reset _dataWritten since we kind of want to check the other side
            // right away
        } else {
            // we're splitting, so should wait a bit
            _dataWritten = 0;
        }

        bool shouldBalance = balancerConfig->shouldBalanceForAutoSplit();

        if (shouldBalance) {
            auto collStatus = grid.catalogClient(txn)->getCollection(txn, _manager->getns());
            if (!collStatus.isOK()) {
                warning() << "Auto-split for " << _manager->getns()
                          << " failed to load collection metadata"
                          << causedBy(collStatus.getStatus());
                return false;
            }

            shouldBalance = collStatus.getValue().value.getAllowBalance();
        }

        const auto suggestedMigrateChunk = std::move(splitStatus.getValue());

        log() << "autosplitted " << _manager->getns() << " shard: " << toString() << " into "
              << (splitCount + 1) << " (splitThreshold " << splitThreshold << ")"
              << (suggestedMigrateChunk ? "" : (string) " (migrate suggested" +
                          (shouldBalance ? ")" : ", but no migrations allowed)"));

        // Top chunk optimization - try to move the top chunk out of this shard to prevent the hot
        // spot from staying on a single shard. This is based on the assumption that succeeding
        // inserts will fall on the top chunk.
        if (suggestedMigrateChunk && shouldBalance) {
            const NamespaceString nss(_manager->getns());

            // We need to use the latest chunk manager (after the split) in order to have the most
            // up-to-date view of the chunk we are about to move
            auto scopedCM = uassertStatusOK(ScopedChunkManager::getExisting(txn, nss));
            auto suggestedChunk =
                scopedCM.cm()->findIntersectingChunk(txn, suggestedMigrateChunk->getMin());

            ChunkType chunkToMove;
            chunkToMove.setNS(nss.ns());
            chunkToMove.setShard(suggestedChunk->getShardId());
            chunkToMove.setMin(suggestedChunk->getMin());
            chunkToMove.setMax(suggestedChunk->getMax());
            chunkToMove.setVersion(suggestedChunk->getLastmod());

            Status rebalanceStatus = Balancer::get(txn)->rebalanceSingleChunk(txn, chunkToMove);
            if (!rebalanceStatus.isOK()) {
                msgassertedNoTraceWithStatus(10412, rebalanceStatus);
            }

            _manager->reload(txn);
        }

        return true;
    } catch (const DBException& e) {
        // TODO: Make this better - there are lots of reasons a split could fail
        // Random so that we don't sync up with other failed splits
        _dataWritten = mkDataWritten();

        // if the collection lock is taken (e.g. we're migrating), it is fine for the split to fail.
        warning() << "could not autosplit collection " << _manager->getns() << causedBy(e);
        return false;
    }
}
Status MigrationSourceManager::commitDonateChunk(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCriticalSection);
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Tell the recipient shard to fetch the latest changes
    Status commitCloneStatus = _cloneDriver->commitClone(txn);

    if (MONGO_FAIL_POINT(failMigrationCommit) && commitCloneStatus.isOK()) {
        commitCloneStatus = {ErrorCodes::InternalError,
                             "Failing _recvChunkCommit due to failpoint."};
    }

    if (!commitCloneStatus.isOK()) {
        return {commitCloneStatus.code(),
                str::stream() << "commit clone failed due to " << commitCloneStatus.toString()};
    }

    // Generate the next collection version.
    ChunkVersion uncommittedCollVersion = _committedMetadata->getCollVersion();
    uncommittedCollVersion.incMajor();

    // applyOps preparation for reflecting the uncommitted metadata on the config server

    // Preconditions
    BSONArrayBuilder preCond;
    {
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
        b.append("q",
                 BSON("query" << BSON(ChunkType::ns(_args.getNss().ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
        {
            BSONObjBuilder bb(b.subobjStart("res"));

            // TODO: For backwards compatibility, we can't yet require an epoch here
            bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(),
                               _committedMetadata->getCollVersion().toLong());
            bb.done();
        }

        preCond.append(b.obj());
    }

    // Update for the chunk which is being donated
    BSONArrayBuilder updates;
    {
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        uncommittedCollVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), _args.getMinKey());
        n.append(ChunkType::max(), _args.getMaxKey());
        n.append(ChunkType::shard(), _args.getToShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        q.done();

        updates.append(op.obj());
    }

    // Update for the chunk being moved

    // Version at which the next highest lastmod will be set. If the chunk being moved is the last
    // in the shard, nextVersion is that chunk's lastmod otherwise the highest version is from the
    // chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = uncommittedCollVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We can
    // figure that out by grabbing the metadata as it has been changed.
    if (_committedMetadata->getNumChunks() > 1) {
        ChunkType bumpChunk;
        invariant(_committedMetadata->getDifferentChunk(_args.getMinKey(), &bumpChunk));

        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();
        nextVersion.incMinor();

        dassert(bumpMin.woCompare(_args.getMinKey()) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _args.getFromShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        q.done();

        updates.append(op.obj());

        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _args.getNss().ns() << "'";
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _args.getNss().ns() << "'";
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeCommitMigration);

    Status applyOpsStatus = grid.catalogClient(txn)->applyChunkOpsDeprecated(
        txn, updates.arr(), preCond.arr(), _args.getNss().ns(), nextVersion);

    if (MONGO_FAIL_POINT(failCommitMigrationCommand)) {
        applyOpsStatus = Status(ErrorCodes::InternalError,
                                "Failpoint 'failCommitMigrationCommand' generated error");
    }

    if (applyOpsStatus.isOK()) {
        // Now that applyOps succeeded and the new collection version is committed, update the
        // collection metadata to the new collection version and forget the migrated chunk.
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        ChunkType migratingChunkToForget;
        migratingChunkToForget.setMin(_args.getMinKey());
        migratingChunkToForget.setMax(_args.getMaxKey());
        _committedMetadata =
            _committedMetadata->cloneMigrate(migratingChunkToForget, uncommittedCollVersion);
        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        css->setMetadata(_committedMetadata);
    } else {
        // This could be an unrelated error (e.g. network error). Check whether the metadata update
        // succeeded by refreshing the collection metadata from the config server and checking that
        // the original chunks no longer exist.

        warning() << "Migration metadata commit may have failed: refreshing metadata to check"
                  << causedBy(applyOpsStatus);

        // Need to get the latest optime in case the refresh request goes to a secondary --
        // otherwise the read won't wait for the write that applyChunkOpsDeprecated may have done.
        Status status = grid.catalogClient(txn)->logChange(
            txn,
            "moveChunk.validating",
            _args.getNss().ns(),
            BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey() << "from"
                       << _args.getFromShardId()
                       << "to"
                       << _args.getToShardId()));
        if (!status.isOK()) {
            fassertStatusOK(
                40137,
                {status.code(),
                 str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey() << ","
                               << _args.getMaxKey()
                               << ") due to "
                               << causedBy(applyOpsStatus)
                               << ", and updating the optime with a write before refreshing the "
                               << "metadata also failed: "
                               << causedBy(status)});
        }

        ShardingState* const shardingState = ShardingState::get(txn);
        ChunkVersion shardVersion;
        Status refreshStatus =
            shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
        fassertStatusOK(34431,
                        {refreshStatus.code(),
                         str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey()
                                       << ","
                                       << _args.getMaxKey()
                                       << ") due to "
                                       << causedBy(applyOpsStatus)
                                       << ", and refreshing collection metadata failed: "
                                       << causedBy(refreshStatus)});

        {
            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

            auto css = CollectionShardingState::get(txn, _args.getNss());
            std::shared_ptr<CollectionMetadata> refreshedMetadata = css->getMetadata();

            if (refreshedMetadata->keyBelongsToMe(_args.getMinKey())) {
                invariant(refreshedMetadata->getCollVersion() ==
                          _committedMetadata->getCollVersion());

                // After refresh, the collection metadata indicates that the donor shard still owns
                // the chunk, so no migration changes were written to the config server metadata.
                return {applyOpsStatus.code(),
                        str::stream() << "Migration was not committed, applyOps failed: "
                                      << causedBy(applyOpsStatus)};
            }

            ChunkVersion refreshedCollectionVersion = refreshedMetadata->getCollVersion();
            if (!refreshedCollectionVersion.equals(nextVersion)) {
                // The refreshed collection metadata's collection version does not match the control
                // chunk's updated collection version, which should now be the highest. The control
                // chunk was not committed, but the migrated chunk was. This state is not
                // recoverable.
                fassertStatusOK(40138,
                                {applyOpsStatus.code(),
                                 str::stream() << "Migration was partially committed, state is "
                                               << "unrecoverable. applyOps error: "
                                               << causedBy(applyOpsStatus)});
            }
        }
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeLeavingCriticalSection);

    scopedGuard.Dismiss();
    _cleanup(txn);

    grid.catalogClient(txn)->logChange(txn,
                                       "moveChunk.commit",
                                       _args.getNss().ns(),
                                       BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    return Status::OK();
}
Esempio n. 19
0
void MetadataManager::refreshActiveMetadata(std::unique_ptr<CollectionMetadata> remoteMetadata) {
    LOG(1) << "Refreshing the active metadata from "
           << (_activeMetadataTracker->metadata ? _activeMetadataTracker->metadata->toStringBasic()
                                                : "(empty)")
           << ", to " << (remoteMetadata ? remoteMetadata->toStringBasic() : "(empty)");

    stdx::lock_guard<stdx::mutex> scopedLock(_managerLock);

    // Collection is not sharded anymore
    if (!remoteMetadata) {
        log() << "Marking collection as not sharded.";

        _receivingChunks.clear();
        _rangesToClean.clear();

        _setActiveMetadata_inlock(nullptr);
        return;
    }

    invariant(!remoteMetadata->getCollVersion().isWriteCompatibleWith(ChunkVersion::UNSHARDED()));
    invariant(!remoteMetadata->getShardVersion().isWriteCompatibleWith(ChunkVersion::UNSHARDED()));

    // Collection is not sharded currently
    if (!_activeMetadataTracker->metadata) {
        log() << "Marking collection as sharded with version " << remoteMetadata->toStringBasic();

        invariant(_receivingChunks.empty());
        invariant(_rangesToClean.empty());

        _setActiveMetadata_inlock(std::move(remoteMetadata));
        return;
    }

    // If the metadata being installed has a different epoch from ours, this means the collection
    // was dropped and recreated, so we must entirely reset the metadata state
    if (_activeMetadataTracker->metadata->getCollVersion().epoch() !=
        remoteMetadata->getCollVersion().epoch()) {
        log() << "Overwriting collection metadata due to epoch change.";

        _receivingChunks.clear();
        _rangesToClean.clear();

        _setActiveMetadata_inlock(std::move(remoteMetadata));
        return;
    }

    // We already have newer version
    if (_activeMetadataTracker->metadata->getCollVersion() >= remoteMetadata->getCollVersion()) {
        LOG(1) << "Attempted to refresh active metadata "
               << _activeMetadataTracker->metadata->toStringBasic() << " with an older version "
               << remoteMetadata->toStringBasic();

        return;
    }

    // Resolve any receiving chunks, which might have completed by now
    for (auto it = _receivingChunks.begin(); it != _receivingChunks.end();) {
        const BSONObj min = it->first;
        const BSONObj max = it->second;

        // Our pending range overlaps at least one chunk
        if (rangeMapContains(remoteMetadata->getChunks(), min, max)) {
            // The remote metadata contains a chunk we were earlier in the process of receiving, so
            // we deem it successfully received.
            LOG(2) << "Verified chunk " << ChunkRange(min, max).toString()
                   << " was migrated earlier to this shard";

            _receivingChunks.erase(it++);
            continue;
        } else if (!rangeMapOverlaps(remoteMetadata->getChunks(), min, max)) {
            ++it;
            continue;
        }

        // Partial overlap indicates that the earlier migration has failed, but the chunk being
        // migrated underwent some splits and other migrations and ended up here again. In this
        // case, we will request full reload of the metadata. Currently this cannot happen, because
        // all migrations are with the explicit knowledge of the recipient shard. However, we leave
        // the option open so that chunk splits can do empty chunk move without having to notify the
        // recipient.
        RangeVector overlappedChunks;
        getRangeMapOverlap(remoteMetadata->getChunks(), min, max, &overlappedChunks);

        for (const auto& overlapChunkMin : overlappedChunks) {
            auto itRecv = _receivingChunks.find(overlapChunkMin.first);
            invariant(itRecv != _receivingChunks.end());

            const ChunkRange receivingRange(itRecv->first, itRecv->second);

            _receivingChunks.erase(itRecv);

            // Make sure any potentially partially copied chunks are scheduled to be cleaned up
            _addRangeToClean_inlock(receivingRange);
        }

        // Need to reset the iterator
        it = _receivingChunks.begin();
    }

    // For compatibility with the current range deleter, which is driven entirely by the contents of
    // the CollectionMetadata update the pending chunks
    for (const auto& receivingChunk : _receivingChunks) {
        ChunkType chunk;
        chunk.setMin(receivingChunk.first);
        chunk.setMax(receivingChunk.second);
        remoteMetadata = remoteMetadata->clonePlusPending(chunk);
    }

    _setActiveMetadata_inlock(std::move(remoteMetadata));
}
StatusWith<MigrateInfoVector> BalancerChunkSelectionPolicyImpl::_getMigrateCandidatesForCollection(
    OperationContext* txn,
    const NamespaceString& nss,
    const ShardStatisticsVector& shardStats,
    bool aggressiveBalanceHint) {
    // Ensure the database exists
    auto dbStatus = Grid::get(txn)->catalogCache()->getDatabase(txn, nss.db().toString());
    if (!dbStatus.isOK()) {
        return {dbStatus.getStatus().code(),
                str::stream() << "Database " << nss.ns() << " was not found due to "
                              << dbStatus.getStatus().toString()};
    }

    shared_ptr<DBConfig> db = dbStatus.getValue();
    invariant(db);

    // Ensure that the collection is sharded
    shared_ptr<ChunkManager> cm = db->getChunkManagerIfExists(txn, nss.ns(), true);
    if (!cm) {
        return {ErrorCodes::NamespaceNotSharded,
                str::stream() << "Collection " << nss.ns() << " does not exist or is not sharded."};
    }

    if (cm->getChunkMap().empty()) {
        return {ErrorCodes::NamespaceNotSharded,
                str::stream() << "Collection " << nss.ns() << " does not have any chunks."};
    }

    ShardToChunksMap shardToChunksMap;
    std::set<BSONObj> allChunkMinimums;

    for (const auto& entry : cm->getChunkMap()) {
        const auto& chunkEntry = entry.second;

        ChunkType chunk;
        chunk.setMin(chunkEntry->getMin());
        chunk.setMax(chunkEntry->getMax());
        chunk.setJumbo(chunkEntry->isJumbo());

        shardToChunksMap[chunkEntry->getShardId()].push_back(chunk);
        allChunkMinimums.insert(chunkEntry->getMin());
    }

    for (const auto& stat : shardStats) {
        // This loop just makes sure there is an entry in shardToChunksMap for every shard, which we
        // plan to consider.
        shardToChunksMap[stat.shardId];
    }

    DistributionStatus distStatus(shardStats, shardToChunksMap);
    {
        vector<TagsType> collectionTags;
        Status status =
            grid.catalogManager(txn)->getTagsForCollection(txn, nss.ns(), &collectionTags);
        if (!status.isOK()) {
            return status;
        }

        for (const auto& tagInfo : collectionTags) {
            BSONObj min = cm->getShardKeyPattern().getKeyPattern().extendRangeBound(
                tagInfo.getMinKey(), false);

            if (!allChunkMinimums.count(min)) {
                // This tag falls somewhere at the middle of a chunk. Therefore we must skip
                // balancing this collection until it is split at the next iteration.
                //
                // TODO: We should be able to just skip chunks, which straddle tags and still make
                // some progress balancing.
                return {ErrorCodes::IllegalOperation,
                        str::stream()
                            << "Tag boundaries " << tagInfo.toString()
                            << " fall in the middle of an existing chunk. Balancing for collection "
                            << nss.ns()
                            << " will be postponed until the chunk is split appropriately."};
            }

            // TODO: TagRange contains all the information from TagsType except for the namespace,
            // so maybe the two can be merged at some point in order to avoid the transformation
            // below.
            if (!distStatus.addTagRange(TagRange(tagInfo.getMinKey().getOwned(),
                                                 tagInfo.getMaxKey().getOwned(),
                                                 tagInfo.getTag()))) {
                return {ErrorCodes::BadValue,
                        str::stream() << "Tag ranges are not valid for collection " << nss.ns()
                                      << ". Balancing for this collection will be skipped until "
                                         "the ranges are fixed."};
            }
        }
    }

    unique_ptr<MigrateInfo> migrateInfo(
        BalancerPolicy::balance(nss.ns(), distStatus, aggressiveBalanceHint));
    if (migrateInfo) {
        return MigrateInfoVector{*migrateInfo};
    }

    return MigrateInfoVector{};
}