Example #1
void ChunkManager::createFirstChunks(OperationContext* txn,
                                     const ShardId& primaryShardId,
                                     const vector<BSONObj>* initPoints,
                                     const set<ShardId>* initShardIds) {
    // TODO distlock?
    // TODO: Race condition if we shard the collection and insert data while we split across
    // the non-primary shard.

    vector<BSONObj> splitPoints;
    vector<ShardId> shardIds;
    calcInitSplitsAndShards(txn, primaryShardId, initPoints, initShardIds, &splitPoints, &shardIds);


    // this is the first chunk; start the versioning from scratch
    ChunkVersion version;
    version.incEpoch();
    version.incMajor();

    log() << "going to create " << splitPoints.size() + 1 << " chunk(s) for: " << _ns
          << " using new epoch " << version.epoch();

    for (unsigned i = 0; i <= splitPoints.size(); i++) {
        BSONObj min = i == 0 ? _keyPattern.getKeyPattern().globalMin() : splitPoints[i - 1];
        BSONObj max =
            i < splitPoints.size() ? splitPoints[i] : _keyPattern.getKeyPattern().globalMax();

        Chunk temp(this, min, max, shardIds[i % shardIds.size()], version);

        BSONObjBuilder chunkBuilder;
        temp.serialize(chunkBuilder);

        BSONObj chunkObj = chunkBuilder.obj();

        Status result = grid.catalogManager(txn)->update(txn,
                                                         ChunkType::ConfigNS,
                                                         BSON(ChunkType::name(temp.genID())),
                                                         chunkObj,
                                                         true,
                                                         false,
                                                         NULL);

        version.incMinor();

        if (!result.isOK()) {
            string ss = str::stream()
                << "creating first chunks failed. result: " << result.reason();
            error() << ss;
            msgasserted(15903, ss);
        }
    }

    _version = ChunkVersion(0, 0, version.epoch());
}
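
The loop above always creates splitPoints.size() + 1 chunks, derives each chunk's bounds from the neighboring split points (or the global min/max key), and assigns shards round-robin while bumping the minor version once per chunk. A minimal standalone sketch of that boundary and assignment logic, using illustrative string keys and shard names instead of MongoDB types:

#include <iostream>
#include <string>
#include <vector>

int main() {
    const std::vector<std::string> splitPoints = {"b", "d", "f"};    // hypothetical split keys
    const std::vector<std::string> shardIds = {"shard0", "shard1"};  // hypothetical shards

    int minorVersion = 0;
    for (size_t i = 0; i <= splitPoints.size(); i++) {
        const std::string min = (i == 0) ? "<globalMin>" : splitPoints[i - 1];
        const std::string max = (i < splitPoints.size()) ? splitPoints[i] : "<globalMax>";
        std::cout << "chunk [" << min << ", " << max << ") -> "
                  << shardIds[i % shardIds.size()] << " (minor " << minorVersion << ")\n";
        ++minorVersion;  // analogous to version.incMinor() after each chunk write
    }
}
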
void ChunkManagerTargeter::noteStaleResponse(const ShardEndpoint& endpoint,
                                             const StaleConfigInfo& staleInfo) {
    dassert(!_needsTargetingRefresh);

    ChunkVersion remoteShardVersion;
    if (!staleInfo.getVersionWanted()) {
        // If we don't have a vWanted sent, assume the version is higher than our current version.
        remoteShardVersion = getShardVersion(*_routingInfo, endpoint.shardName);
        remoteShardVersion.incMajor();
    } else {
        remoteShardVersion = *staleInfo.getVersionWanted();
    }

    ShardVersionMap::iterator it = _remoteShardVersions.find(endpoint.shardName);
    if (it == _remoteShardVersions.end()) {
        _remoteShardVersions.insert(std::make_pair(endpoint.shardName, remoteShardVersion));
    } else {
        ChunkVersion& previouslyNotedVersion = it->second;
        if (previouslyNotedVersion.epoch() == remoteShardVersion.epoch()) {
            if (previouslyNotedVersion.isOlderThan(remoteShardVersion)) {
                previouslyNotedVersion = remoteShardVersion;
            }
        } else {
            // Epoch changed midway while applying the batch, so set the version to something
            // unique and non-existent to force a reload when refreshIsNeeded is called.
            previouslyNotedVersion = ChunkVersion::IGNORED();
        }
    }
}
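
noteStaleResponse keeps, per shard, the highest remote version seen while the epoch stays stable, and collapses to a sentinel once the epoch changes mid-batch. A standalone sketch of that bookkeeping, with plain integers standing in for ChunkVersion:

#include <map>
#include <string>

struct SimpleVersion {
    int epoch = 0;     // stands in for the OID epoch
    int combined = 0;  // stands in for the (major, minor) pair
};

void noteRemoteVersion(std::map<std::string, SimpleVersion>& remembered,
                       const std::string& shardName,
                       const SimpleVersion& remote) {
    auto it = remembered.find(shardName);
    if (it == remembered.end()) {
        remembered.emplace(shardName, remote);
    } else if (it->second.epoch == remote.epoch) {
        if (it->second.combined < remote.combined)
            it->second = remote;  // keep the newer version for this shard
    } else {
        // epoch changed midway: remember a sentinel, analogous to ChunkVersion::IGNORED()
        it->second = SimpleVersion{-1, -1};
    }
}
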
Example #3
void MoveChunkRequest::appendAsCommand(BSONObjBuilder* builder,
                                       const NamespaceString& nss,
                                       ChunkVersion collectionVersion,
                                       const ConnectionString& configServerConnectionString,
                                       const ShardId& fromShardId,
                                       const ShardId& toShardId,
                                       const ChunkRange& range,
                                       ChunkVersion chunkVersion,
                                       int64_t maxChunkSizeBytes,
                                       const MigrationSecondaryThrottleOptions& secondaryThrottle,
                                       bool waitForDelete,
                                       bool takeDistLock) {
    invariant(builder->asTempObj().isEmpty());
    invariant(nss.isValid());

    builder->append(kMoveChunk, nss.ns());
    collectionVersion.appendForCommands(builder);
    builder->append(kEpoch, collectionVersion.epoch());
    builder->append(kConfigServerConnectionString, configServerConnectionString.toString());
    builder->append(kFromShardId, fromShardId.toString());
    builder->append(kToShardId, toShardId.toString());
    range.append(builder);
    chunkVersion.appendWithFieldForCommands(builder, kChunkVersion);
    builder->append(kMaxChunkSizeBytes, static_cast<long long>(maxChunkSizeBytes));
    secondaryThrottle.append(builder);
    builder->append(kWaitForDelete, waitForDelete);
    builder->append(kTakeDistLock, takeDistLock);
}
CollectionMetadata::CollectionMetadata(const BSONObj& keyPattern, ChunkVersion collectionVersion)
    : _collVersion(collectionVersion),
      _shardVersion(ChunkVersion(0, 0, collectionVersion.epoch())),
      _keyPattern(keyPattern.getOwned()),
      _pendingMap(SimpleBSONObjComparator::kInstance.makeBSONObjIndexedMap<CachedChunkInfo>()),
      _chunksMap(SimpleBSONObjComparator::kInstance.makeBSONObjIndexedMap<CachedChunkInfo>()),
      _rangesMap(SimpleBSONObjComparator::kInstance.makeBSONObjIndexedMap<CachedChunkInfo>()) {}
void MoveChunkRequest::appendAsCommand(BSONObjBuilder* builder,
                                       const NamespaceString& nss,
                                       ChunkVersion chunkVersion,
                                       const ConnectionString& configServerConnectionString,
                                       const ShardId& fromShardId,
                                       const ShardId& toShardId,
                                       const ChunkRange& range,
                                       int64_t maxChunkSizeBytes,
                                       const MigrationSecondaryThrottleOptions& secondaryThrottle,
                                       bool waitForDelete) {
    invariant(builder->asTempObj().isEmpty());
    invariant(nss.isValid());

    builder->append(kMoveChunk, nss.ns());
    chunkVersion.appendToCommand(builder);  // 3.4 shard compatibility
    builder->append(kEpoch, chunkVersion.epoch());
    // config connection string is included for 3.4 shard compatibility
    builder->append(kConfigServerConnectionString, configServerConnectionString.toString());
    builder->append(kFromShardId, fromShardId.toString());
    builder->append(kToShardId, toShardId.toString());
    range.append(builder);
    builder->append(kMaxChunkSizeBytes, static_cast<long long>(maxChunkSizeBytes));
    secondaryThrottle.append(builder);
    builder->append(kWaitForDelete, waitForDelete);
    builder->append(kTakeDistLock, false);
}
std::unique_ptr<CollectionMetadata> CollectionMetadata::cloneMigrate(
    const ChunkType& chunk, const ChunkVersion& newCollectionVersion) const {
    invariant(newCollectionVersion.epoch() == _collVersion.epoch());
    invariant(newCollectionVersion > _collVersion);
    invariant(rangeMapContains(_chunksMap, chunk.getMin(), chunk.getMax()));

    unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>());
    metadata->_keyPattern = _keyPattern.getOwned();
    metadata->fillKeyPatternFields();
    metadata->_pendingMap = _pendingMap;
    metadata->_chunksMap = _chunksMap;
    metadata->_chunksMap.erase(chunk.getMin());

    metadata->_shardVersion =
        (metadata->_chunksMap.empty() ? ChunkVersion(0, 0, newCollectionVersion.epoch())
                                      : newCollectionVersion);
    metadata->_collVersion = newCollectionVersion;
    metadata->fillRanges();

    invariant(metadata->isValid());
    return metadata;
}
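
cloneMigrate illustrates a convention used throughout these examples: a shard that no longer owns any chunk keeps a zero shard version that still carries the collection epoch. A standalone sketch of that rule, with an illustrative struct in place of ChunkVersion:

struct FakeVersion {
    int major = 0;
    int minor = 0;
    int epoch = 0;  // stands in for the OID epoch
};

// After donating a chunk, the donor keeps the new collection version if it still owns
// chunks; otherwise its shard version collapses to (0, 0, epoch).
FakeVersion shardVersionAfterDonation(bool chunksRemain, const FakeVersion& newCollectionVersion) {
    if (!chunksRemain)
        return FakeVersion{0, 0, newCollectionVersion.epoch};
    return newCollectionVersion;
}
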
Status onShardVersionMismatch(OperationContext* opCtx,
                              const NamespaceString& nss,
                              ChunkVersion shardVersionReceived,
                              bool forceRefreshFromThisThread) noexcept {
    invariant(!opCtx->lockState()->isLocked());
    invariant(!opCtx->getClient()->isInDirectClient());

    auto const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->canAcceptShardedCommands());

    LOG(2) << "Metadata refresh requested for " << nss.ns() << " at shard version "
           << shardVersionReceived;

    ShardingStatistics::get(opCtx).countStaleConfigErrors.addAndFetch(1);

    // Ensure any ongoing migrations have completed before trying to do the refresh. This wait is
    // just an optimization so that MongoS does not exhaust its maximum number of StaleConfig retry
    // attempts while the migration is being committed.
    try {
        auto& oss = OperationShardingState::get(opCtx);
        oss.waitForMigrationCriticalSectionSignal(opCtx);
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto currentShardVersion = [&] {
        AutoGetCollection autoColl(opCtx, nss, MODE_IS);
        const auto currentMetadata = CollectionShardingState::get(opCtx, nss)->getMetadata(opCtx);
        if (currentMetadata) {
            return currentMetadata->getShardVersion();
        }

        return ChunkVersion::UNSHARDED();
    }();

    if (currentShardVersion.epoch() == shardVersionReceived.epoch() &&
        currentShardVersion.majorVersion() >= shardVersionReceived.majorVersion()) {
        // Don't need to remotely reload if we're in the same epoch and the requested version is
        // smaller than the one we know about. This means that the remote side is behind.
        return Status::OK();
    }

    try {
        forceShardFilteringMetadataRefresh(opCtx, nss, forceRefreshFromThisThread);
        return Status::OK();
    } catch (const DBException& ex) {
        log() << "Failed to refresh metadata for collection" << nss << causedBy(redact(ex));
        return ex.toStatus();
    }
}
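
The fast path above skips the remote refresh exactly when the epochs match and the locally known major version is at least the one the client sent. A standalone sketch of that predicate:

struct VersionSeen {
    int epoch;
    int major;
};

// Returns true when a refresh from the config server is actually required.
bool refreshNeeded(const VersionSeen& current, const VersionSeen& received) {
    if (current.epoch == received.epoch && current.major >= received.major)
        return false;  // same epoch and we already know a version at least as new
    return true;       // different epoch, or the client knows a newer major version
}
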
unique_ptr<CollectionMetadata> CollectionMetadata::clonePlusChunk(
    const BSONObj& minKey, const BSONObj& maxKey, const ChunkVersion& newShardVersion) const {
    invariant(newShardVersion.epoch() == _shardVersion.epoch());
    invariant(newShardVersion.isSet());
    invariant(minKey.woCompare(maxKey) < 0);
    invariant(!rangeMapOverlaps(_chunksMap, minKey, maxKey));

    unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>());
    metadata->_keyPattern = _keyPattern.getOwned();
    metadata->fillKeyPatternFields();
    metadata->_pendingMap = _pendingMap;
    metadata->_chunksMap = _chunksMap;
    metadata->_chunksMap.insert(make_pair(minKey.getOwned(), maxKey.getOwned()));
    metadata->_shardVersion = newShardVersion;
    metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : _collVersion;
    metadata->fillRanges();

    invariant(metadata->isValid());
    return metadata;
}
        /**
         * Stores ranges for a particular collection and shard starting from some version
         */
        void storeCollectionRanges( const NamespaceString& nss,
                                    const string& shardName,
                                    const vector<KeyRange>& ranges,
                                    const ChunkVersion& startVersion ) {

            // Get key pattern from first range
            ASSERT_GREATER_THAN( ranges.size(), 0u );

            CollectionType coll;
            coll.setNS( nss.ns() );
            coll.setKeyPattern( ranges.begin()->keyPattern );
            coll.setEpoch( startVersion.epoch() );
            coll.setUpdatedAt( 1ULL );
            string errMsg;
            ASSERT( coll.isValid( &errMsg ) );

            DBDirectClient client(&_txn);

            client.update( CollectionType::ConfigNS,
                           BSON( CollectionType::ns( coll.getNS() ) ),
                           coll.toBSON(), true, false );

            ChunkVersion nextVersion = startVersion;
            for ( vector<KeyRange>::const_iterator it = ranges.begin(); it != ranges.end(); ++it ) {

                ChunkType chunk;
                // TODO: We should not rely on the serialized ns, minkey being unique in the future;
                // this causes problems since it links string serialization to correctness.
                chunk.setName( Chunk::genID( nss, it->minKey ) );
                chunk.setShard( shardName );
                chunk.setNS( nss.ns() );
                chunk.setVersion( nextVersion );
                chunk.setMin( it->minKey );
                chunk.setMax( it->maxKey );
                nextVersion.incMajor();

                client.insert( ChunkType::ConfigNS, chunk.toBSON() );
            }
        }
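
The test helper above gives every stored chunk a strictly increasing major version, starting from startVersion. A standalone sketch of that versioning pattern, using illustrative key ranges:

#include <iostream>
#include <string>
#include <vector>

int main() {
    struct Range { std::string min, max; };
    const std::vector<Range> ranges = {{"a", "m"}, {"m", "z"}};  // hypothetical key ranges

    int major = 1;  // stands in for startVersion's major component
    for (const Range& r : ranges) {
        std::cout << "chunk [" << r.min << ", " << r.max << ") stored at major version "
                  << major << "\n";
        ++major;  // analogous to nextVersion.incMajor() after each insert
    }
}
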
Example #10
Status ShardingState::refreshMetadataIfNeeded(OperationContext* txn,
                                              const string& ns,
                                              const ChunkVersion& reqShardVersion,
                                              ChunkVersion* latestShardVersion) {
    // The _configServerTickets serializes this process such that only a small number of threads
    // can try to refresh at the same time.

    LOG(2) << "metadata refresh requested for " << ns << " at shard version " << reqShardVersion;

    //
    // Queuing of refresh requests starts here when remote reload is needed. This may take time.
    // TODO: Explicitly expose the queuing discipline.
    //

    _configServerTickets.waitForTicket();
    TicketHolderReleaser needTicketFrom(&_configServerTickets);

    //
    // Fast path - check if the requested version is at a higher version than the current
    // metadata version or a different epoch before verifying against config server.
    //

    shared_ptr<CollectionMetadata> storedMetadata;
    {
        stdx::lock_guard<stdx::mutex> lk(_mutex);
        CollectionMetadataMap::iterator it = _collMetadata.find(ns);
        if (it != _collMetadata.end())
            storedMetadata = it->second;
    }
    ChunkVersion storedShardVersion;
    if (storedMetadata)
        storedShardVersion = storedMetadata->getShardVersion();
    *latestShardVersion = storedShardVersion;

    if (storedShardVersion >= reqShardVersion &&
        storedShardVersion.epoch() == reqShardVersion.epoch()) {
        // Don't need to remotely reload if we're in the same epoch with a >= version
        return Status::OK();
    }

    //
    // Slow path - remotely reload
    //
    // Cases:
    // A) Initial config load and/or secondary take-over.
    // B) Migration TO this shard finished, notified by mongos.
    // C) Dropping a collection, notified (currently) by mongos.
    // D) Stale client wants to reload metadata with a different *epoch*, so we aren't sure.

    if (storedShardVersion.epoch() != reqShardVersion.epoch()) {
        // Need to remotely reload if our epochs aren't the same, to verify
        LOG(1) << "metadata change requested for " << ns << ", from shard version "
               << storedShardVersion << " to " << reqShardVersion
               << ", need to verify with config server";
    } else {
        // Need to remotely reload since our epochs are the same but the requested version is newer
        LOG(1) << "metadata version update requested for " << ns << ", from shard version "
               << storedShardVersion << " to " << reqShardVersion
               << ", need to verify with config server";
    }

    return doRefreshMetadata(txn, ns, reqShardVersion, true, latestShardVersion);
}
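
The _configServerTickets pattern above bounds how many threads may attempt a remote refresh at once; the ticket is taken before the fast-path check and released on scope exit. A minimal standalone sketch of the same idea, assuming C++20's std::counting_semaphore rather than MongoDB's TicketHolder:

#include <semaphore>

std::counting_semaphore<3> configServerTickets(3);  // e.g. at most 3 concurrent refreshes

void refreshWithTicket() {
    configServerTickets.acquire();  // waitForTicket()
    struct Releaser {               // plays the role of TicketHolderReleaser
        ~Releaser() { configServerTickets.release(); }
    } releaseOnExit;

    // ... fast-path version check and, if still needed, the remote reload go here ...
}
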
Example #11
    bool mergeChunks( OperationContext* txn,
                      const NamespaceString& nss,
                      const BSONObj& minKey,
                      const BSONObj& maxKey,
                      const OID& epoch,
                      string* errMsg ) {

        //
        // Get sharding state up-to-date
        //

        ConnectionString configLoc = ConnectionString::parse( shardingState.getConfigServer(),
                                                              *errMsg );
        if ( !configLoc.isValid() ){
            warning() << *errMsg << endl;
            return false;
        }

        //
        // Get the distributed lock
        //

        ScopedDistributedLock collLock( configLoc, nss.ns() );
        collLock.setLockMessage( stream() << "merging chunks in " << nss.ns() << " from "
                                          << minKey << " to " << maxKey );

        Status acquisitionStatus = collLock.tryAcquire();
        if (!acquisitionStatus.isOK()) {
            *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                               << " to merge chunks in [" << minKey << "," << maxKey << ")"
                               << causedBy(acquisitionStatus);

            warning() << *errMsg << endl;
            return false;
        }

        //
        // We now have the collection lock, refresh metadata to latest version and sanity check
        //

        ChunkVersion shardVersion;
        Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion);

        if ( !status.isOK() ) {

            *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                    << nss.ns() << causedBy( status.reason() );

            warning() << *errMsg << endl;
            return false;
        }

        if ( epoch.isSet() && shardVersion.epoch() != epoch ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has changed" << " since merge was sent" << "(sent epoch : "
                               << epoch.toString()
                               << ", current epoch : " << shardVersion.epoch().toString() << ")";

            warning() << *errMsg << endl;
            return false;
        }

        CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( nss.ns() );

        if ( !metadata || metadata->getKeyPattern().isEmpty() ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " is not sharded";

            warning() << *errMsg << endl;
            return false;
        }

        dassert( metadata->getShardVersion().equals( shardVersion ) );

        if ( !metadata->isValidKey( minKey ) || !metadata->isValidKey( maxKey ) ) {

            *errMsg = stream() << "could not merge chunks, the range "
                               << rangeToString( minKey, maxKey ) << " is not valid"
                               << " for collection " << nss.ns() << " with key pattern "
                               << metadata->getKeyPattern();

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Get merged chunk information
        //

        ChunkVersion mergeVersion = metadata->getCollVersion();
        mergeVersion.incMinor();

        OwnedPointerVector<ChunkType> chunksToMerge;

        ChunkType itChunk;
        itChunk.setMin( minKey );
        itChunk.setMax( minKey );
        itChunk.setNS( nss.ns() );
        itChunk.setShard( shardingState.getShardName() );

        while ( itChunk.getMax().woCompare( maxKey ) < 0 &&
                metadata->getNextChunk( itChunk.getMax(), &itChunk ) ) {
            auto_ptr<ChunkType> saved( new ChunkType );
            itChunk.cloneTo( saved.get() );
            chunksToMerge.mutableVector().push_back( saved.release() );
        }

        if ( chunksToMerge.empty() ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range starting at " << minKey
                               << " and ending at " << maxKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Validate the range starts and ends at chunks and has no holes, error if not valid
        //

        BSONObj firstDocMin = ( *chunksToMerge.begin() )->getMin();
        BSONObj firstDocMax = ( *chunksToMerge.begin() )->getMax();
        // minKey is inclusive
        bool minKeyInRange = rangeContains( firstDocMin, firstDocMax, minKey );

        if ( !minKeyInRange ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range starting at " << minKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        BSONObj lastDocMin = ( *chunksToMerge.rbegin() )->getMin();
        BSONObj lastDocMax = ( *chunksToMerge.rbegin() )->getMax();
        // maxKey is exclusive
        bool maxKeyInRange = lastDocMin.woCompare( maxKey ) < 0 &&
                lastDocMax.woCompare( maxKey ) >= 0;

        if ( !maxKeyInRange ) {
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range ending at " << maxKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        bool validRangeStartKey = firstDocMin.woCompare( minKey ) == 0;
        bool validRangeEndKey = lastDocMax.woCompare( maxKey ) == 0;

        if ( !validRangeStartKey || !validRangeEndKey ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " does not contain a chunk "
                               << ( !validRangeStartKey ? "starting at " + minKey.toString() : "" )
                               << ( !validRangeStartKey && !validRangeEndKey ? " or " : "" )
                               << ( !validRangeEndKey ? "ending at " + maxKey.toString() : "" );

            warning() << *errMsg << endl;
            return false;
        }

        if ( chunksToMerge.size() == 1 ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " already contains chunk for " << rangeToString( minKey, maxKey );

            warning() << *errMsg << endl;
            return false;
        }

        bool holeInRange = false;

        // Look for hole in range
        ChunkType* prevChunk = *chunksToMerge.begin();
        ChunkType* nextChunk = NULL;
        for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
                it != chunksToMerge.end(); ++it ) {
            if ( it == chunksToMerge.begin() ) continue;

            nextChunk = *it;
            if ( prevChunk->getMax().woCompare( nextChunk->getMin() ) != 0 ) {
                holeInRange = true;
                break;
            }
            prevChunk = nextChunk;
        }

        if ( holeInRange ) {

            dassert( NULL != nextChunk );
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has a hole in the range " << rangeToString( minKey, maxKey )
                               << " at " << rangeToString( prevChunk->getMax(),
                                                           nextChunk->getMin() );

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Run apply ops command
        //

        BSONObj applyOpsCmd = buildApplyOpsCmd( chunksToMerge,
                                                shardVersion,
                                                mergeVersion );

        bool ok;
        BSONObj result;
        try {
            ScopedDbConnection conn( configLoc, 30.0 );
            ok = conn->runCommand( "config", applyOpsCmd, result );
            if ( !ok ) *errMsg = result.toString();
            conn.done();
        }
        catch( const DBException& ex ) {
            ok = false;
            *errMsg = ex.toString();
        }

        if ( !ok ) {
            *errMsg = stream() << "could not merge chunks for " << nss.ns()
                               << ", writing to config failed" << causedBy( errMsg );

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Install merged chunk metadata
        //

        {
            Lock::DBLock writeLk(txn->lockState(), nss.db(), newlm::MODE_X);
            shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
        }

        //
        // Log change
        //

        BSONObj mergeLogEntry = buildMergeLogEntry( chunksToMerge,
                                                    shardVersion,
                                                    mergeVersion );

        configServer.logChange( "merge", nss.ns(), mergeLogEntry );

        return true;
    }
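
Most of mergeChunks is validation: the chunks covering [minKey, maxKey) must start and end exactly on the requested bounds and must be contiguous. A standalone sketch of the hole check, with string keys standing in for BSON shard keys:

#include <string>
#include <vector>

struct SimpleChunk {
    std::string min, max;
};

// Returns true when two consecutive chunks fail to share a boundary, i.e. the range
// being merged has a hole and the merge must be rejected.
bool hasHole(const std::vector<SimpleChunk>& chunksToMerge) {
    for (size_t i = 1; i < chunksToMerge.size(); ++i) {
        if (chunksToMerge[i - 1].max != chunksToMerge[i].min)
            return true;
    }
    return false;
}
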
StatusWith<std::unique_ptr<CollectionMetadata>> CollectionMetadata::cloneMerge(
    const BSONObj& minKey, const BSONObj& maxKey, const ChunkVersion& newShardVersion) const {
    invariant(newShardVersion.epoch() == _shardVersion.epoch());
    invariant(newShardVersion > _shardVersion);

    RangeVector overlap;
    getRangeMapOverlap(_chunksMap, minKey, maxKey, &overlap);

    if (overlap.empty() || overlap.size() == 1) {
        return {ErrorCodes::IllegalOperation,
                stream() << "cannot merge range " << rangeToString(minKey, maxKey)
                         << (overlap.empty() ? ", no chunks found in this range"
                                             : ", only one chunk found in this range")};
    }

    bool validStartEnd = true;
    bool validNoHoles = true;

    if (overlap.begin()->first.woCompare(minKey) != 0) {
        // First chunk doesn't start with minKey
        validStartEnd = false;
    } else if (overlap.rbegin()->second.woCompare(maxKey) != 0) {
        // Last chunk doesn't end with maxKey
        validStartEnd = false;
    } else {
        // Check that there are no holes
        BSONObj prevMaxKey = minKey;
        for (RangeVector::iterator it = overlap.begin(); it != overlap.end(); ++it) {
            if (it->first.woCompare(prevMaxKey) != 0) {
                validNoHoles = false;
                break;
            }
            prevMaxKey = it->second;
        }
    }

    if (!validStartEnd || !validNoHoles) {
        return {ErrorCodes::IllegalOperation,
                stream() << "cannot merge range " << rangeToString(minKey, maxKey)
                         << ", overlapping chunks " << overlapToString(overlap)
                         << (!validStartEnd ? " do not have the same min and max key"
                                            : " are not all adjacent")};
    }

    unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>());
    metadata->_keyPattern = _keyPattern.getOwned();
    metadata->fillKeyPatternFields();
    metadata->_pendingMap = _pendingMap;
    metadata->_chunksMap = _chunksMap;
    metadata->_rangesMap = _rangesMap;
    metadata->_shardVersion = newShardVersion;
    metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : this->_collVersion;

    for (RangeVector::iterator it = overlap.begin(); it != overlap.end(); ++it) {
        metadata->_chunksMap.erase(it->first);
    }

    metadata->_chunksMap.insert(make_pair(minKey, maxKey));

    invariant(metadata->isValid());
    return std::move(metadata);
}
StatusWith<std::unique_ptr<CollectionMetadata>> CollectionMetadata::cloneSplit(
    const BSONObj& minKey,
    const BSONObj& maxKey,
    const std::vector<BSONObj>& splitKeys,
    const ChunkVersion& newShardVersion) const {
    invariant(newShardVersion.epoch() == _shardVersion.epoch());
    invariant(newShardVersion > _shardVersion);

    // The version required in both resulting chunks could be simply an increment in the
    // minor portion of the current version.  However, we are enforcing uniqueness over the
    // attributes <ns, version> of the configdb collection 'chunks'.  So in practice, a
    // migrate somewhere may force this split to pick up a version that has the major
    // portion higher than the one that this shard has been using.
    //
    // TODO drop the uniqueness constraint and tighten the check below so that only the
    // minor portion of version changes

    // Check that we have the exact chunk that will be subtracted.
    if (!rangeMapContains(_chunksMap, minKey, maxKey)) {
        stream errMsg;
        errMsg << "cannot split chunk " << rangeToString(minKey, maxKey)
               << ", this shard does not contain the chunk";

        if (rangeMapOverlaps(_chunksMap, minKey, maxKey)) {
            RangeVector overlap;
            getRangeMapOverlap(_chunksMap, minKey, maxKey, &overlap);

            errMsg << " and it overlaps " << overlapToString(overlap);
        }

        return {ErrorCodes::IllegalOperation, errMsg};
    }

    unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>());
    metadata->_keyPattern = _keyPattern.getOwned();
    metadata->fillKeyPatternFields();
    metadata->_pendingMap = _pendingMap;
    metadata->_chunksMap = _chunksMap;
    metadata->_shardVersion = newShardVersion;  // will increment 2nd, 3rd,... chunks below

    BSONObj startKey = minKey;
    for (const auto& split : splitKeys) {
        // Check that the split key is valid
        if (!rangeContains(minKey, maxKey, split)) {
            return {ErrorCodes::IllegalOperation,
                    stream() << "cannot split chunk " << rangeToString(minKey, maxKey) << " at key "
                             << split};
        }

        // Check that the split keys are in order
        if (split.woCompare(startKey) <= 0) {
            // The split keys came in out of order, this probably indicates a bug, so fail the
            // operation. Re-iterate splitKeys to build a useful error message including the array
            // of splitKeys in the order received.
            str::stream errMsg;
            errMsg << "Invalid input to splitChunk, split keys must be in order, got: [";
            for (auto it2 = splitKeys.cbegin(); it2 != splitKeys.cend(); ++it2) {
                if (it2 != splitKeys.begin()) {
                    errMsg << ", ";
                }
                errMsg << it2->toString();
            }
            errMsg << "]";
            return {ErrorCodes::IllegalOperation, errMsg};
        }

        metadata->_chunksMap[startKey] = split.getOwned();
        metadata->_chunksMap.insert(make_pair(split.getOwned(), maxKey.getOwned()));
        metadata->_shardVersion.incMinor();
        startKey = split;
    }

    metadata->_collVersion =
        metadata->_shardVersion > _collVersion ? metadata->_shardVersion : _collVersion;
    metadata->fillRanges();

    invariant(metadata->isValid());
    return std::move(metadata);
}
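
cloneSplit rewrites the chunk map in place: each split key shortens the piece that currently starts at startKey, inserts a new piece running to the original max, and bumps the minor version once per new piece. A standalone sketch of that bookkeeping on a plain std::map, with illustrative string keys:

#include <map>
#include <string>
#include <vector>

int main() {
    std::map<std::string, std::string> chunks = {{"a", "z"}};  // min -> max
    const std::vector<std::string> splitKeys = {"g", "p"};     // hypothetical split keys

    int minorVersion = 0;
    std::string startKey = "a";
    const std::string maxKey = "z";
    for (const std::string& split : splitKeys) {
        chunks[startKey] = split;       // shrink the piece starting at startKey
        chunks.emplace(split, maxKey);  // new piece from the split key to the chunk max
        ++minorVersion;                 // analogous to _shardVersion.incMinor()
        startKey = split;
    }
    // chunks now holds {"a"->"g", "g"->"p", "p"->"z"} and minorVersion == 2
}
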
Example #14
StatusWith<boost::optional<ChunkRange>> splitChunkAtMultiplePoints(
    OperationContext* opCtx,
    const ShardId& shardId,
    const NamespaceString& nss,
    const ShardKeyPattern& shardKeyPattern,
    ChunkVersion collectionVersion,
    const ChunkRange& chunkRange,
    const std::vector<BSONObj>& splitPoints) {
    invariant(!splitPoints.empty());

    const size_t kMaxSplitPoints = 8192;

    if (splitPoints.size() > kMaxSplitPoints) {
        return {ErrorCodes::BadValue,
                str::stream() << "Cannot split chunk in more than " << kMaxSplitPoints
                              << " parts at a time."};
    }

    // Sanity check that we are not attempting to split at the boundaries of the chunk. This check
    // is already performed at chunk split commit time, but we are performing it here for parity
    // with old auto-split code, which might rely on it.
    if (SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMin() == splitPoints.front())) {
        const std::string msg(str::stream() << "not splitting chunk " << chunkRange.toString()
                                            << ", split point "
                                            << splitPoints.front()
                                            << " is exactly on chunk bounds");
        return {ErrorCodes::CannotSplit, msg};
    }

    if (SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMax() == splitPoints.back())) {
        const std::string msg(str::stream() << "not splitting chunk " << chunkRange.toString()
                                            << ", split point "
                                            << splitPoints.back()
                                            << " is exactly on chunk bounds");
        return {ErrorCodes::CannotSplit, msg};
    }

    BSONObjBuilder cmd;
    cmd.append("splitChunk", nss.ns());
    cmd.append("from", shardId.toString());
    cmd.append("keyPattern", shardKeyPattern.toBSON());
    cmd.append("epoch", collectionVersion.epoch());
    collectionVersion.appendForCommands(&cmd);  // backwards compatibility with v3.4
    chunkRange.append(&cmd);
    cmd.append("splitKeys", splitPoints);

    BSONObj cmdObj = cmd.obj();

    Status status{ErrorCodes::InternalError, "Uninitialized value"};
    BSONObj cmdResponse;

    auto shardStatus = Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardId);
    if (!shardStatus.isOK()) {
        status = shardStatus.getStatus();
    } else {
        auto cmdStatus = shardStatus.getValue()->runCommandWithFixedRetryAttempts(
            opCtx,
            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
            "admin",
            cmdObj,
            Shard::RetryPolicy::kNotIdempotent);
        if (!cmdStatus.isOK()) {
            status = std::move(cmdStatus.getStatus());
        } else {
            status = std::move(cmdStatus.getValue().commandStatus);
            cmdResponse = std::move(cmdStatus.getValue().response);
        }
    }

    if (!status.isOK()) {
        log() << "Split chunk " << redact(cmdObj) << " failed" << causedBy(redact(status));
        return {status.code(), str::stream() << "split failed due to " << status.toString()};
    }

    BSONElement shouldMigrateElement;
    status = bsonExtractTypedField(cmdResponse, kShouldMigrate, Object, &shouldMigrateElement);
    if (status.isOK()) {
        auto chunkRangeStatus = ChunkRange::fromBSON(shouldMigrateElement.embeddedObject());
        if (!chunkRangeStatus.isOK()) {
            return chunkRangeStatus.getStatus();
        }

        return boost::optional<ChunkRange>(std::move(chunkRangeStatus.getValue()));
    } else if (status != ErrorCodes::NoSuchKey) {
        warning()
            << "Chunk migration will be skipped because splitChunk returned invalid response: "
            << redact(cmdResponse) << ". Extracting " << kShouldMigrate << " field failed"
            << causedBy(redact(status));
    }

    return boost::optional<ChunkRange>();
}
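
Before dispatching splitChunk, the helper rejects split points that sit exactly on the chunk bounds, since such a split would describe an empty chunk. A standalone sketch of that sanity check, assuming the split points are already sorted:

#include <string>
#include <vector>

// Returns true when neither the first nor the last split point coincides with a chunk bound.
bool splitPointsAvoidChunkBounds(const std::string& chunkMin,
                                 const std::string& chunkMax,
                                 const std::vector<std::string>& splitPoints) {
    return !splitPoints.empty() && splitPoints.front() != chunkMin &&
        splitPoints.back() != chunkMax;
}
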
Example #15
Status ShardingState::doRefreshMetadata(OperationContext* txn,
                                        const string& ns,
                                        const ChunkVersion& reqShardVersion,
                                        bool useRequestedVersion,
                                        ChunkVersion* latestShardVersion) {
    // The idea here is that we're going to reload the metadata from the config server, but
    // we need to do so outside any locks.  When we get our result back, if the current metadata
    // has changed, we may not be able to install the new metadata.

    //
    // Get the initial metadata
    // No DBLock is needed since the metadata is expected to change during reload.
    //

    shared_ptr<CollectionMetadata> beforeMetadata;

    {
        stdx::lock_guard<stdx::mutex> lk(_mutex);

        // We can't reload if sharding is not enabled - i.e. without a config server location
        if (!_enabled) {
            string errMsg = str::stream() << "cannot refresh metadata for " << ns
                                          << " before sharding has been enabled";

            warning() << errMsg;
            return Status(ErrorCodes::NotYetInitialized, errMsg);
        }

        // We also can't reload if a shard name has not yet been set.
        if (_shardName.empty()) {
            string errMsg = str::stream() << "cannot refresh metadata for " << ns
                                          << " before shard name has been set";

            warning() << errMsg;
            return Status(ErrorCodes::NotYetInitialized, errMsg);
        }

        CollectionMetadataMap::iterator it = _collMetadata.find(ns);
        if (it != _collMetadata.end()) {
            beforeMetadata = it->second;
        }
    }

    ChunkVersion beforeShardVersion;
    ChunkVersion beforeCollVersion;
    if (beforeMetadata) {
        beforeShardVersion = beforeMetadata->getShardVersion();
        beforeCollVersion = beforeMetadata->getCollVersion();
    }

    *latestShardVersion = beforeShardVersion;

    //
    // Determine whether we need to diff or fully reload
    //

    bool fullReload = false;
    if (!beforeMetadata) {
        // We don't have any metadata to reload from
        fullReload = true;
    } else if (useRequestedVersion && reqShardVersion.epoch() != beforeShardVersion.epoch()) {
        // It's not useful to use the metadata as a base because we think the epoch will differ
        fullReload = true;
    }

    //
    // Load the metadata from the remote server, start construction
    //

    LOG(0) << "remotely refreshing metadata for " << ns
           << (useRequestedVersion
                   ? string(" with requested shard version ") + reqShardVersion.toString()
                   : "")
           << (fullReload ? ", current shard version is " : " based on current shard version ")
           << beforeShardVersion << ", current metadata version is " << beforeCollVersion;

    string errMsg;

    MetadataLoader mdLoader;
    CollectionMetadata* remoteMetadataRaw = new CollectionMetadata();
    shared_ptr<CollectionMetadata> remoteMetadata(remoteMetadataRaw);

    Timer refreshTimer;
    Status status = mdLoader.makeCollectionMetadata(grid.catalogManager(),
                                                    ns,
                                                    getShardName(),
                                                    fullReload ? NULL : beforeMetadata.get(),
                                                    remoteMetadataRaw);
    long long refreshMillis = refreshTimer.millis();

    if (status.code() == ErrorCodes::NamespaceNotFound) {
        remoteMetadata.reset();
        remoteMetadataRaw = NULL;
    } else if (!status.isOK()) {
        warning() << "could not remotely refresh metadata for " << ns << causedBy(status.reason());

        return status;
    }

    ChunkVersion remoteShardVersion;
    ChunkVersion remoteCollVersion;
    if (remoteMetadata) {
        remoteShardVersion = remoteMetadata->getShardVersion();
        remoteCollVersion = remoteMetadata->getCollVersion();
    }

    //
    // Get ready to install loaded metadata if needed
    //

    shared_ptr<CollectionMetadata> afterMetadata;
    ChunkVersion afterShardVersion;
    ChunkVersion afterCollVersion;
    ChunkVersion::VersionChoice choice;

    // If we choose to install the new metadata, this describes the kind of install
    enum InstallType {
        InstallType_New,
        InstallType_Update,
        InstallType_Replace,
        InstallType_Drop,
        InstallType_None
    } installType = InstallType_None;  // compiler complains otherwise

    {
        // Exclusive collection lock needed since we're now potentially changing the metadata,
        // and don't want reads/writes to be ongoing.
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_IX);
        Lock::CollectionLock collLock(txn->lockState(), ns, MODE_X);

        //
        // Get the metadata now that the load has completed
        //

        stdx::lock_guard<stdx::mutex> lk(_mutex);

        // Don't reload if our config server has changed or sharding is no longer enabled
        if (!_enabled) {
            string errMsg = str::stream() << "could not refresh metadata for " << ns
                                          << ", sharding is no longer enabled";

            warning() << errMsg;
            return Status(ErrorCodes::NotYetInitialized, errMsg);
        }

        CollectionMetadataMap::iterator it = _collMetadata.find(ns);
        if (it != _collMetadata.end())
            afterMetadata = it->second;

        if (afterMetadata) {
            afterShardVersion = afterMetadata->getShardVersion();
            afterCollVersion = afterMetadata->getCollVersion();
        }

        *latestShardVersion = afterShardVersion;

        //
        // Resolve newer pending chunks with the remote metadata, finish construction
        //

        status = mdLoader.promotePendingChunks(afterMetadata.get(), remoteMetadataRaw);

        if (!status.isOK()) {
            warning() << "remote metadata for " << ns
                      << " is inconsistent with current pending chunks"
                      << causedBy(status.reason());

            return status;
        }

        //
        // Compare the 'before', 'after', and 'remote' versions/epochs and choose newest
        // Zero-epochs (sentinel value for "dropped" collections), are tested by
        // !epoch.isSet().
        //

        choice = ChunkVersion::chooseNewestVersion(
            beforeCollVersion, afterCollVersion, remoteCollVersion);

        if (choice == ChunkVersion::VersionChoice_Remote) {
            dassert(!remoteCollVersion.epoch().isSet() || remoteShardVersion >= beforeShardVersion);

            if (!afterCollVersion.epoch().isSet()) {
                // First metadata load
                installType = InstallType_New;
                dassert(it == _collMetadata.end());
                _collMetadata.insert(make_pair(ns, remoteMetadata));
            } else if (remoteCollVersion.epoch().isSet() &&
                       remoteCollVersion.epoch() == afterCollVersion.epoch()) {
                // Update to existing metadata
                installType = InstallType_Update;

                // Invariant: If CollMetadata was not found, version should have been 0.
                dassert(it != _collMetadata.end());
                it->second = remoteMetadata;
            } else if (remoteCollVersion.epoch().isSet()) {
                // New epoch detected, replacing metadata
                installType = InstallType_Replace;

                // Invariant: If CollMetadata was not found, version should have been 0.
                dassert(it != _collMetadata.end());
                it->second = remoteMetadata;
            } else {
                dassert(!remoteCollVersion.epoch().isSet());

                // Drop detected
                installType = InstallType_Drop;
                _collMetadata.erase(it);
            }

            *latestShardVersion = remoteShardVersion;
        }
    }
    // End _mutex
    // End DBWrite

    //
    // Do messaging based on what happened above
    //
    string localShardVersionMsg = beforeShardVersion.epoch() == afterShardVersion.epoch()
        ? afterShardVersion.toString()
        : beforeShardVersion.toString() + " / " + afterShardVersion.toString();

    if (choice == ChunkVersion::VersionChoice_Unknown) {
        string errMsg = str::stream()
            << "need to retry loading metadata for " << ns
            << ", collection may have been dropped or recreated during load"
            << " (loaded shard version : " << remoteShardVersion.toString()
            << ", stored shard versions : " << localShardVersionMsg << ", took " << refreshMillis
            << "ms)";

        warning() << errMsg;
        return Status(ErrorCodes::RemoteChangeDetected, errMsg);
    }

    if (choice == ChunkVersion::VersionChoice_Local) {
        LOG(0) << "metadata of collection " << ns
               << " already up to date (shard version : " << afterShardVersion.toString()
               << ", took " << refreshMillis << "ms)";
        return Status::OK();
    }

    dassert(choice == ChunkVersion::VersionChoice_Remote);

    switch (installType) {
        case InstallType_New:
            LOG(0) << "collection " << ns << " was previously unsharded"
                   << ", new metadata loaded with shard version " << remoteShardVersion;
            break;
        case InstallType_Update:
            LOG(0) << "updating metadata for " << ns << " from shard version "
                   << localShardVersionMsg << " to shard version " << remoteShardVersion;
            break;
        case InstallType_Replace:
            LOG(0) << "replacing metadata for " << ns << " at shard version "
                   << localShardVersionMsg << " with a new epoch (shard version "
                   << remoteShardVersion << ")";
            break;
        case InstallType_Drop:
            LOG(0) << "dropping metadata for " << ns << " at shard version " << localShardVersionMsg
                   << ", took " << refreshMillis << "ms";
            break;
        default:
            verify(false);
            break;
    }

    if (installType != InstallType_Drop) {
        LOG(0) << "collection version was loaded at version " << remoteCollVersion << ", took "
               << refreshMillis << "ms";
    }

    return Status::OK();
}
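
The refresh above compares three snapshots of the collection version: the one taken before the remote load, the one re-read after re-acquiring the lock, and the one just loaded; only when the remote version is newest does it get installed. A loose standalone approximation of that three-way choice, with a single integer standing in for the full (epoch, major, minor) comparison:

enum class VersionChoice { Local, Remote, Unknown };

// before: version prior to the remote load, after: version re-read under the lock,
// remote: version just loaded from the config server.
VersionChoice chooseNewest(long long before, long long after, long long remote) {
    if (remote > before && remote > after)
        return VersionChoice::Remote;  // install the freshly loaded metadata
    if (after >= remote)
        return VersionChoice::Local;   // someone else already installed something newer
    return VersionChoice::Unknown;     // versions moved inconsistently; caller should retry
}
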
MigrationSourceManager::MigrationSourceManager(OperationContext* txn, MoveChunkRequest request)
    : _args(std::move(request)), _startTime() {
    invariant(!txn->lockState()->isLocked());

    const auto& oss = OperationShardingState::get(txn);
    if (!oss.hasShardVersion()) {
        uasserted(ErrorCodes::InvalidOptions, "collection version is missing");
    }

    // Even though the moveChunk command transmits a value in the operation's shardVersion field,
    // this value does not actually contain the shard version, but the global collection version.
    const ChunkVersion expectedCollectionVersion = oss.getShardVersion(_args.getNss());

    log() << "Starting chunk migration for "
          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
          << " with expected collection version " << expectedCollectionVersion;

    // Now that the collection is locked, snapshot the metadata and fetch the latest versions
    ShardingState* const shardingState = ShardingState::get(txn);

    ChunkVersion shardVersion;

    Status refreshStatus =
        shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
    if (!refreshStatus.isOK()) {
        uasserted(refreshStatus.code(),
                  str::stream() << "cannot start migrate of chunk "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " due to "
                                << refreshStatus.toString());
    }

    if (shardVersion.majorVersion() == 0) {
        // If the major version is zero, this means we do not have any chunks locally to migrate in
        // the first place
        uasserted(ErrorCodes::IncompatibleShardingMetadata,
                  str::stream() << "cannot start migrate of chunk "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " with zero shard version");
    }

    // Snapshot the committed metadata from the time the migration starts
    {
        ScopedTransaction scopedXact(txn, MODE_IS);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

        auto css = CollectionShardingState::get(txn, _args.getNss());
        _committedMetadata = css->getMetadata();
    }

    const ChunkVersion collectionVersion = _committedMetadata->getCollVersion();

    if (expectedCollectionVersion.epoch() != collectionVersion.epoch()) {
        throw SendStaleConfigException(
            _args.getNss().ns(),
            str::stream() << "cannot move chunk "
                          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                          << " because collection may have been dropped. "
                          << "current epoch: "
                          << collectionVersion.epoch()
                          << ", cmd epoch: "
                          << expectedCollectionVersion.epoch(),
            expectedCollectionVersion,
            collectionVersion);
    }

    // With nonzero shard version, we must have a coll version >= our shard version
    invariant(collectionVersion >= shardVersion);

    // With nonzero shard version, we must have a shard key
    invariant(!_committedMetadata->getKeyPattern().isEmpty());

    ChunkType origChunk;
    if (!_committedMetadata->getNextChunk(_args.getMinKey(), &origChunk)) {
        // If this assertion is hit, it means that whoever called the shard moveChunk command
        // (mongos or the CSRS balancer) did not check whether the chunk actually belongs to this
        // shard. It is a benign error and does not indicate data corruption.
        uasserted(40145,
                  str::stream() << "Chunk with bounds "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " is not owned by this shard.");
    }

    uassert(40146,
            str::stream() << "Unable to find chunk with the exact bounds "
                          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                          << " at collection version "
                          << collectionVersion.toString()
                          << ". This indicates corrupted metadata.",
            origChunk.getMin().woCompare(_args.getMinKey()) == 0 &&
                origChunk.getMax().woCompare(_args.getMaxKey()) == 0);
}
Example #17
        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {

            // Steps
            // 1. check basic config
            // 2. extract params from command
            // 3. fast check
            // 4. slow check (LOCKS)
            
            // step 1

            lastError.disableForCommand();
            ShardedConnectionInfo* info = ShardedConnectionInfo::get( true );

            // make sure we have the mongos id for writebacks
            if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) ) 
                return false;

            bool authoritative = cmdObj.getBoolField( "authoritative" );
            
            // check config server is ok or enable sharding
            if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) )
                return false;

            // check shard name/hosts are correct
            if ( cmdObj["shard"].type() == String ) {
                shardingState.gotShardName( cmdObj["shard"].String() );
            }
            
            // Handle initial shard connection
            if( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ){

                result.append( "initialized", true );

                // Send back wire version to let mongos know what protocol we can speak
                result.append( "minWireVersion", minWireVersion );
                result.append( "maxWireVersion", maxWireVersion );

                return true;
            }

            // we can run on a slave up to here
            if ( ! isMaster( "admin" ) ) {
                result.append( "errmsg" , "not master" );
                result.append( "note" , "from post init in setShardVersion" );
                return false;
            }

            // step 2
            
            string ns = cmdObj["setShardVersion"].valuestrsafe();
            if ( ns.size() == 0 ) {
                errmsg = "need to specify namespace";
                return false;
            }

            if( ! ChunkVersion::canParseBSON( cmdObj, "version" ) ){
                errmsg = "need to specify version";
                return false;
            }

            const ChunkVersion version = ChunkVersion::fromBSON( cmdObj, "version" );
            
            // step 3

            const ChunkVersion oldVersion = info->getVersion(ns);
            const ChunkVersion globalVersion = shardingState.getVersion(ns);

            oldVersion.addToBSON( result, "oldVersion" );
            
            if ( globalVersion.isSet() && version.isSet() ) {
                // this means neither side is in the middle of a reset,
                // so it's safe to make some assumptions

                if ( version.isWriteCompatibleWith( globalVersion ) ) {
                    // mongos and mongod agree!
                    if ( ! oldVersion.isWriteCompatibleWith( version ) ) {
                        if ( oldVersion < globalVersion &&
                             oldVersion.hasCompatibleEpoch(globalVersion) )
                        {
                            info->setVersion( ns , version );
                        }
                        else if ( authoritative ) {
                            // this means there was a drop and our version is reset
                            info->setVersion( ns , version );
                        }
                        else {
                            result.append( "ns" , ns );
                            result.appendBool( "need_authoritative" , true );
                            errmsg = "verifying drop on '" + ns + "'";
                            return false;
                        }
                    }
                    return true;
                }
                
            }

            // step 4
            
            // this is because of a weird segfault I saw and I can't see why this should ever be set
            massert( 13647 , str::stream() << "context should be empty here, is: " << cc().getContext()->ns() , cc().getContext() == 0 ); 
        
            if ( oldVersion.isSet() && ! globalVersion.isSet() ) {
                // this had been reset
                info->setVersion( ns , ChunkVersion( 0, OID() ) );
            }

            if ( ! version.isSet() && ! globalVersion.isSet() ) {
                // this connection is cleaning itself
                info->setVersion( ns , ChunkVersion( 0, OID() ) );
                return true;
            }

            // Cases below all either return OR fall-through to remote metadata reload.
            if ( version.isSet() || !globalVersion.isSet() ) {

                // Not Dropping

                // TODO: Refactor all of this
                if ( version < oldVersion && version.hasCompatibleEpoch( oldVersion ) ) {
                    errmsg = "this connection already had a newer version of collection '" + ns + "'";
                    result.append( "ns" , ns );
                    version.addToBSON( result, "newVersion" );
                    globalVersion.addToBSON( result, "globalVersion" );
                    return false;
                }

                // TODO: Refactor all of this
                if ( version < globalVersion && version.hasCompatibleEpoch( globalVersion ) ) {
                    while ( shardingState.inCriticalMigrateSection() ) {
                        log() << "waiting till out of critical section" << endl;
                        shardingState.waitTillNotInCriticalSection( 10 );
                    }
                    errmsg = "shard global version for collection is higher than trying to set to '" + ns + "'";
                    result.append( "ns" , ns );
                    version.addToBSON( result, "version" );
                    globalVersion.addToBSON( result, "globalVersion" );
                    result.appendBool( "reloadConfig" , true );
                    return false;
                }

                if ( ! globalVersion.isSet() && ! authoritative ) {
                    // Needed b/c when the last chunk is moved off a shard, the version gets reset to zero, which
                    // should require a reload.
                    while ( shardingState.inCriticalMigrateSection() ) {
                        log() << "waiting till out of critical section" << endl;
                        shardingState.waitTillNotInCriticalSection( 10 );
                    }

                    // need authoritative for first look
                    result.append( "ns" , ns );
                    result.appendBool( "need_authoritative" , true );
                    errmsg = "first time for collection '" + ns + "'";
                    return false;
                }

                // Fall through to metadata reload below
            }
            else {

                // Dropping

                if ( ! authoritative ) {
                    result.appendBool( "need_authoritative" , true );
                    result.append( "ns" , ns );
                    globalVersion.addToBSON( result, "globalVersion" );
                    errmsg = "dropping needs to be authoritative";
                    return false;
                }

                // Fall through to metadata reload below
            }

            ChunkVersion currVersion;
            Status status = shardingState.refreshMetadataIfNeeded( ns, version, &currVersion );

            if (!status.isOK()) {

                // The reload itself was interrupted or confused here

                errmsg = str::stream() << "could not refresh metadata for " << ns
                                       << " with requested shard version " << version.toString()
                                       << ", stored shard version is " << currVersion.toString()
                                       << causedBy( status.reason() );

                warning() << errmsg << endl;

                result.append( "ns" , ns );
                version.addToBSON( result, "version" );
                currVersion.addToBSON( result, "globalVersion" );
                result.appendBool( "reloadConfig", true );

                return false;
            }
            else if ( !version.isWriteCompatibleWith( currVersion ) ) {

                // We reloaded a version that doesn't match the version mongos was trying to
                // set.

                errmsg = str::stream() << "requested shard version differs from"
                                       << " config shard version for " << ns
                                       << ", requested version is " << version.toString()
                                       << " but found version " << currVersion.toString();

                OCCASIONALLY warning() << errmsg << endl;

                // WARNING: the exact fields below are important for compatibility with mongos
                // version reload.

                result.append( "ns" , ns );
                currVersion.addToBSON( result, "globalVersion" );

                // If this was a reset of a collection or the last chunk moved out, inform mongos to
                // do a full reload.
                if (currVersion.epoch() != version.epoch() || !currVersion.isSet() ) {
                    result.appendBool( "reloadConfig", true );
                    // Zero-version also needed to trigger full mongos reload, sadly
                    // TODO: Make this saner, and less impactful (full reload on last chunk is bad)
                    ChunkVersion( 0, 0, OID() ).addToBSON( result, "version" );
                    // For debugging
                    version.addToBSON( result, "origVersion" );
                }
                else {
                    version.addToBSON( result, "version" );
                }

                return false;
            }

            info->setVersion( ns , version );
            return true;
        }
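
The handler above ultimately decides between accepting the requested version, asking the caller to resend authoritatively, or forcing a config reload, and the key rule is that versions are only ordered against each other when their epochs match. The snippet below is a minimal, self-contained sketch of that rule; SimpleChunkVersion and shouldRejectAsStale are illustrative stand-ins invented here, not types or functions from the MongoDB source.

// A minimal stand-in for mongo::ChunkVersion: a (major, minor, epoch) triple.
#include <cstdint>
#include <iostream>
#include <string>

struct SimpleChunkVersion {
    std::uint32_t majorVer = 0;
    std::uint32_t minorVer = 0;
    std::string epoch;  // an empty string plays the role of an unset OID

    bool isSet() const { return majorVer != 0 || minorVer != 0; }
    bool hasCompatibleEpoch(const SimpleChunkVersion& other) const {
        return epoch == other.epoch;
    }
    bool operator<(const SimpleChunkVersion& other) const {
        return majorVer != other.majorVer ? majorVer < other.majorVer
                                          : minorVer < other.minorVer;
    }
};

// Mirrors the handler's core rule: a request is rejected (and the caller asked to
// reload) only when it carries an older version *with the same epoch* as the
// version already known; epoch mismatches are treated as drop/recreate cases.
bool shouldRejectAsStale(const SimpleChunkVersion& requested,
                         const SimpleChunkVersion& global) {
    return requested < global && requested.hasCompatibleEpoch(global);
}

int main() {
    SimpleChunkVersion requested{2, 0, "epochA"};
    SimpleChunkVersion global{3, 1, "epochA"};
    std::cout << std::boolalpha << shouldRejectAsStale(requested, global) << "\n";  // true
    return 0;
}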
Example #18
0
    Status ShardingState::doRefreshMetadata( const string& ns,
                                             const ChunkVersion& reqShardVersion,
                                             bool useRequestedVersion,
                                             ChunkVersion* latestShardVersion )
    {
        // The idea here is that we're going to reload the metadata from the config server, but
        // we need to do so outside any locks.  When we get our result back, if the current metadata
        // has changed, we may not be able to install the new metadata.

        //
        // Get the initial metadata
        // No DBLock is needed since the metadata is expected to change during reload.
        //

        CollectionMetadataPtr beforeMetadata;
        string shardName;
        {
            scoped_lock lk( _mutex );
            CollectionMetadataMap::iterator it = _collMetadata.find( ns );
            if ( it != _collMetadata.end() ) beforeMetadata = it->second;
            shardName = _shardName;
        }

        ChunkVersion beforeShardVersion;
        ChunkVersion beforeCollVersion;
        if ( beforeMetadata ) {
            beforeShardVersion = beforeMetadata->getShardVersion();
            beforeCollVersion = beforeMetadata->getCollVersion();
        }

        *latestShardVersion = beforeShardVersion;

        // We can't reload without a shard name.  Must check here before loading, since shard name
        // may have changed if we checked it earlier and released the _mutex.
        if ( shardName.empty() ) {

            string errMsg = str::stream() << "cannot refresh metadata for " << ns
                                          << " before shard name has been set";

            LOG( 0 ) << errMsg << endl;
            return Status( ErrorCodes::IllegalOperation, errMsg );
        }

        //
        // Determine whether we need to diff or fully reload
        //

        bool fullReload = false;
        if ( !beforeMetadata ) {
            // We don't have any metadata to reload from
            fullReload = true;
        }
        else if ( useRequestedVersion && reqShardVersion.epoch() != beforeShardVersion.epoch() ) {
            // It's not useful to use the metadata as a base because we think the epoch will differ
            fullReload = true;
        }

        //
        // Load the metadata from the remote server, start construction
        //

        LOG( 0 ) << "remotely refreshing metadata for " << ns
                 << ( useRequestedVersion ?
                      string( " with requested shard version " ) + reqShardVersion.toString() : "" )
                 << ( fullReload ?
                      ", current shard version is " : " based on current shard version " )
                 << beforeShardVersion
                 << ", current metadata version is " << beforeCollVersion << endl;

        string errMsg;
        ConnectionString configServerLoc = ConnectionString::parse( _configServer, errMsg );
        MetadataLoader mdLoader( configServerLoc );
        CollectionMetadata* remoteMetadataRaw = new CollectionMetadata();
        CollectionMetadataPtr remoteMetadata( remoteMetadataRaw );

        Timer refreshTimer;
        Status status =
                mdLoader.makeCollectionMetadata( ns,
                                                 shardName,
                                                 ( fullReload ? NULL : beforeMetadata.get() ),
                                                 remoteMetadataRaw );
        long long refreshMillis = refreshTimer.millis();

        if ( status.code() == ErrorCodes::NamespaceNotFound ) {
            remoteMetadata.reset();
            remoteMetadataRaw = NULL;
        }
        else if ( !status.isOK() ) {

            warning() << "could not remotely refresh metadata for " << ns
                      << causedBy( status.reason() ) << endl;

            return status;
        }

        ChunkVersion remoteShardVersion;
        ChunkVersion remoteCollVersion;
        if ( remoteMetadata ) {
            remoteShardVersion = remoteMetadata->getShardVersion();
            remoteCollVersion = remoteMetadata->getCollVersion();
        }

        //
        // Get ready to install loaded metadata if needed
        //

        CollectionMetadataPtr afterMetadata;
        ChunkVersion afterShardVersion;
        ChunkVersion afterCollVersion;
        ChunkVersion::VersionChoice choice;

        // If we choose to install the new metadata, this describes the kind of install
        enum InstallType {
            InstallType_New, InstallType_Update, InstallType_Replace, InstallType_Drop,
            InstallType_None
        } installType = InstallType_None; // compiler complains otherwise

        {
            // DBLock needed since we're now potentially changing the metadata, and don't want
            // reads/writes to be ongoing.
            Lock::DBWrite writeLk( ns );

            //
            // Get the metadata now that the load has completed
            //

            scoped_lock lk( _mutex );
            CollectionMetadataMap::iterator it = _collMetadata.find( ns );
            if ( it != _collMetadata.end() ) afterMetadata = it->second;

            if ( afterMetadata ) {
                afterShardVersion = afterMetadata->getShardVersion();
                afterCollVersion = afterMetadata->getCollVersion();
            }

            *latestShardVersion = afterShardVersion;

            //
            // Resolve newer pending chunks with the remote metadata, finish construction
            //

            status = mdLoader.promotePendingChunks( afterMetadata.get(), remoteMetadataRaw );

            if ( !status.isOK() ) {

                warning() << "remote metadata for " << ns
                          << " is inconsistent with current pending chunks"
                          << causedBy( status.reason() ) << endl;

                return status;
            }

            //
            // Compare the 'before', 'after', and 'remote' versions/epochs and choose newest
            // Zero epochs (the sentinel value for "dropped" collections) are tested by
            // !epoch.isSet().
            //

            choice = ChunkVersion::chooseNewestVersion( beforeCollVersion,
                                                        afterCollVersion,
                                                        remoteCollVersion );

            if ( choice == ChunkVersion::VersionChoice_Remote ) {
                dassert(!remoteCollVersion.epoch().isSet() ||
                        remoteShardVersion >= beforeShardVersion);

                if ( !afterCollVersion.epoch().isSet() ) {

                    // First metadata load
                    installType = InstallType_New;
                    dassert( it == _collMetadata.end() );
                    _collMetadata.insert( make_pair( ns, remoteMetadata ) );
                }
                else if ( remoteCollVersion.epoch().isSet() &&
                          remoteCollVersion.epoch() == afterCollVersion.epoch() ) {

                    // Update to existing metadata
                    installType = InstallType_Update;

                    // Invariant: If CollMetadata was not found, version should have been 0.
                    dassert( it != _collMetadata.end() );
                    it->second = remoteMetadata;
                }
                else if ( remoteCollVersion.epoch().isSet() ) {

                    // New epoch detected, replacing metadata
                    installType = InstallType_Replace;

                    // Invariant: If CollMetadata was not found, version should have been 0.
                    dassert( it != _collMetadata.end() );
                    it->second = remoteMetadata;
                }
                else {
                    dassert( !remoteCollVersion.epoch().isSet() );

                    // Drop detected
                    installType = InstallType_Drop;
                    _collMetadata.erase( it );
                }

                *latestShardVersion = remoteShardVersion;
            }
        }
        // End _mutex
        // End DBWrite

        //
        // Do messaging based on what happened above
        //

        string versionMsg = str::stream()
            << " (loaded metadata version : " << remoteCollVersion.toString()
            << ( beforeCollVersion.epoch() == afterCollVersion.epoch() ?
                     string( ", stored version : " ) + afterCollVersion.toString() :
                     string( ", stored versions : " ) +
                         beforeCollVersion.toString() + " / " + afterCollVersion.toString() )
            << ", took " << refreshMillis << "ms)";

        if ( choice == ChunkVersion::VersionChoice_Unknown ) {

            string errMsg =
                str::stream() << "need to retry loading metadata for " << ns
                              << ", collection may have been dropped or recreated during load"
                              << versionMsg;

            warning() << errMsg << endl;
            return Status( ErrorCodes::RemoteChangeDetected, errMsg );
        }

        if ( choice == ChunkVersion::VersionChoice_Local ) {

            LOG( 0 ) << "newer metadata not found for " << ns << versionMsg << endl;
            return Status::OK();
        }

        dassert( choice == ChunkVersion::VersionChoice_Remote );

        switch( installType ) {
        case InstallType_New:
            LOG( 0 ) << "loaded new metadata for " << ns << versionMsg << endl;
            break;
        case InstallType_Update:
            LOG( 0 ) << "loaded newer metadata for " << ns << versionMsg << endl;
            break;
        case InstallType_Replace:
            LOG( 0 ) << "replacing metadata for " << ns << versionMsg << endl;
            break;
        case InstallType_Drop:
            LOG( 0 ) << "dropping metadata for " << ns << versionMsg << endl;
            break;
        default:
            verify( false );
            break;
        }

        return Status::OK();
    }
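
doRefreshMetadata leans on ChunkVersion::chooseNewestVersion to arbitrate between the collection version seen before the reload, the version installed while the load was in flight, and the version just fetched from the config server. The sketch below illustrates the intent of that three-way choice under simplified assumptions; Ver, Choice, and pickNewest are invented names for illustration only and do not reproduce the real implementation.

// Simplified illustration of the three-way choice doRefreshMetadata relies on:
// given the collection version seen before the reload ('before'), the version
// currently installed ('after'), and the version just loaded from the config
// server ('remote'), decide which metadata should win.
#include <string>

struct Ver {
    unsigned majorVer = 0;
    unsigned minorVer = 0;
    std::string epoch;  // empty == unset, the sentinel used for dropped collections

    bool olderThan(const Ver& o) const {
        return majorVer != o.majorVer ? majorVer < o.majorVer : minorVer < o.minorVer;
    }
};

enum class Choice { Local, Remote, Unknown };

Choice pickNewest(const Ver& before, const Ver& after, const Ver& remote) {
    // If the installed epoch changed while our load was in flight, another thread
    // replaced the metadata underneath us: report Unknown so the caller retries.
    if (before.epoch != after.epoch)
        return Choice::Unknown;
    // A remote epoch that differs from the installed one (including the unset
    // epoch of a drop) wins outright; with matching epochs the remote wins only
    // if it is strictly newer than what is installed.
    if (remote.epoch != after.epoch || after.olderThan(remote))
        return Choice::Remote;
    return Choice::Local;
}

int main() {
    Ver before{3, 0, "e1"};
    Ver after{3, 0, "e1"};
    Ver remote{4, 2, "e1"};
    return pickNewest(before, after, remote) == Choice::Remote ? 0 : 1;
}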
Example #19
0
bool mergeChunks(OperationContext* txn,
                 const NamespaceString& nss,
                 const BSONObj& minKey,
                 const BSONObj& maxKey,
                 const OID& epoch,
                 string* errMsg) {
    // Get the distributed lock
    string whyMessage = stream() << "merging chunks in " << nss.ns() << " from " << minKey << " to "
                                 << maxKey;
    auto scopedDistLock = grid.catalogManager(txn)->distLock(
        txn, nss.ns(), whyMessage, DistLockManager::kSingleLockAttemptTimeout);

    if (!scopedDistLock.isOK()) {
        *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                           << " to merge chunks in [" << minKey << "," << maxKey << ")"
                           << causedBy(scopedDistLock.getStatus());

        warning() << *errMsg;
        return false;
    }

    ShardingState* shardingState = ShardingState::get(txn);

    //
    // We now have the collection lock, refresh metadata to latest version and sanity check
    //

    ChunkVersion shardVersion;
    Status status = shardingState->refreshMetadataNow(txn, nss.ns(), &shardVersion);

    if (!status.isOK()) {
        *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                << nss.ns() << causedBy(status.reason());

        warning() << *errMsg;
        return false;
    }

    if (epoch.isSet() && shardVersion.epoch() != epoch) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has changed"
                           << " since merge was sent"
                           << "(sent epoch : " << epoch.toString()
                           << ", current epoch : " << shardVersion.epoch().toString() << ")";

        warning() << *errMsg;
        return false;
    }

    shared_ptr<CollectionMetadata> metadata = shardingState->getCollectionMetadata(nss.ns());

    if (!metadata || metadata->getKeyPattern().isEmpty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " is not sharded";

        warning() << *errMsg;
        return false;
    }

    dassert(metadata->getShardVersion().equals(shardVersion));

    if (!metadata->isValidKey(minKey) || !metadata->isValidKey(maxKey)) {
        *errMsg = stream() << "could not merge chunks, the range " << rangeToString(minKey, maxKey)
                           << " is not valid"
                           << " for collection " << nss.ns() << " with key pattern "
                           << metadata->getKeyPattern();

        warning() << *errMsg;
        return false;
    }

    //
    // Get merged chunk information
    //

    ChunkVersion mergeVersion = metadata->getCollVersion();
    mergeVersion.incMinor();

    std::vector<ChunkType> chunksToMerge;

    ChunkType itChunk;
    itChunk.setMin(minKey);
    itChunk.setMax(minKey);
    itChunk.setNS(nss.ns());
    itChunk.setShard(shardingState->getShardName());

    while (itChunk.getMax().woCompare(maxKey) < 0 &&
           metadata->getNextChunk(itChunk.getMax(), &itChunk)) {
        chunksToMerge.push_back(itChunk);
    }

    if (chunksToMerge.empty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " and ending at " << maxKey
                           << " does not belong to shard " << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    //
    // Validate the range starts and ends at chunks and has no holes, error if not valid
    //

    BSONObj firstDocMin = chunksToMerge.front().getMin();
    BSONObj firstDocMax = chunksToMerge.front().getMax();
    // minKey is inclusive
    bool minKeyInRange = rangeContains(firstDocMin, firstDocMax, minKey);

    if (!minKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " does not belong to shard "
                           << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    BSONObj lastDocMin = chunksToMerge.back().getMin();
    BSONObj lastDocMax = chunksToMerge.back().getMax();
    // maxKey is exclusive
    bool maxKeyInRange = lastDocMin.woCompare(maxKey) < 0 && lastDocMax.woCompare(maxKey) >= 0;

    if (!maxKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range ending at " << maxKey << " does not belong to shard "
                           << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    bool validRangeStartKey = firstDocMin.woCompare(minKey) == 0;
    bool validRangeEndKey = lastDocMax.woCompare(maxKey) == 0;

    if (!validRangeStartKey || !validRangeEndKey) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " does not contain a chunk "
                           << (!validRangeStartKey ? "starting at " + minKey.toString() : "")
                           << (!validRangeStartKey && !validRangeEndKey ? " or " : "")
                           << (!validRangeEndKey ? "ending at " + maxKey.toString() : "");

        warning() << *errMsg;
        return false;
    }

    if (chunksToMerge.size() == 1) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " already contains chunk for " << rangeToString(minKey, maxKey);

        warning() << *errMsg;
        return false;
    }

    // Look for hole in range
    for (size_t i = 1; i < chunksToMerge.size(); ++i) {
        if (chunksToMerge[i - 1].getMax().woCompare(chunksToMerge[i].getMin()) != 0) {
            *errMsg =
                stream() << "could not merge chunks, collection " << nss.ns()
                         << " has a hole in the range " << rangeToString(minKey, maxKey) << " at "
                         << rangeToString(chunksToMerge[i - 1].getMax(), chunksToMerge[i].getMin());

            warning() << *errMsg;
            return false;
        }
    }

    //
    // Run apply ops command
    //
    Status applyOpsStatus = runApplyOpsCmd(txn, chunksToMerge, shardVersion, mergeVersion);
    if (!applyOpsStatus.isOK()) {
        warning() << applyOpsStatus;
        return false;
    }

    //
    // Install merged chunk metadata
    //

    {
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock writeLk(txn->lockState(), nss.db(), MODE_IX);
        Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_X);

        shardingState->mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
    }

    //
    // Log change
    //

    BSONObj mergeLogEntry = buildMergeLogEntry(chunksToMerge, shardVersion, mergeVersion);

    grid.catalogManager(txn)->logChange(txn, "merge", nss.ns(), mergeLogEntry);

    return true;
}
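
Before it touches any metadata, mergeChunks insists that the requested range lines up exactly with existing chunk boundaries, spans more than one chunk, and contains no holes. The standalone sketch below restates that validation over integer keys instead of BSON shard-key objects; isMergeableRange is a hypothetical helper written for illustration only.

// Standalone sketch of the range validation mergeChunks performs before merging:
// the chunks covering [minKey, maxKey) must start exactly at minKey, end exactly
// at maxKey, contain more than one chunk, and have no holes between neighbours.
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

bool isMergeableRange(const std::vector<std::pair<int, int>>& chunks,
                      int minKey, int maxKey) {
    if (chunks.size() < 2)
        return false;  // a single chunk has nothing to merge with
    if (chunks.front().first != minKey)
        return false;  // range must start on an existing chunk boundary
    if (chunks.back().second != maxKey)
        return false;  // range must end on an existing chunk boundary
    for (std::size_t i = 1; i < chunks.size(); ++i) {
        if (chunks[i - 1].second != chunks[i].first)
            return false;  // hole between consecutive chunks
    }
    return true;
}

int main() {
    std::cout << std::boolalpha;
    std::cout << isMergeableRange({{0, 10}, {10, 25}, {25, 40}}, 0, 40) << "\n";  // true
    std::cout << isMergeableRange({{0, 10}, {15, 40}}, 0, 40) << "\n";            // false: hole at [10, 15)
    return 0;
}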