Example #1
0
    Status enforceLegacyWriteConcern( MultiCommandDispatch* dispatcher,
                                      const StringData& dbName,
                                      const BSONObj& options,
                                      const HostOpTimeMap& hostOpTimes,
                                      vector<LegacyWCResponse>* legacyWCResponses ) {

        if ( hostOpTimes.empty() ) {
            return Status::OK();
        }

        for ( HostOpTimeMap::const_iterator it = hostOpTimes.begin(); it != hostOpTimes.end();
            ++it ) {

            const ConnectionString& shardEndpoint = it->first;
            const HostOpTime hot = it->second;
            const OpTime& opTime = hot.opTime;
            const OID& electionId = hot.electionId;

            LOG( 3 ) << "enforcing write concern " << options << " on " << shardEndpoint.toString()
                     << " at opTime " << opTime.toStringPretty() << " with electionID "
                     << electionId;

            BSONObj gleCmd = buildGLECmdWithOpTime( options, opTime, electionId );

            RawBSONSerializable gleCmdSerial( gleCmd );
            dispatcher->addCommand( shardEndpoint, dbName, gleCmdSerial );
        }

        dispatcher->sendAll();

        vector<Status> failedStatuses;

        while ( dispatcher->numPending() > 0 ) {

            ConnectionString shardEndpoint;
            RawBSONSerializable gleResponseSerial;

            Status dispatchStatus = dispatcher->recvAny( &shardEndpoint, &gleResponseSerial );
            if ( !dispatchStatus.isOK() ) {
                // We need to get all responses before returning
                failedStatuses.push_back( dispatchStatus );
                continue;
            }

            BSONObj gleResponse = BatchSafeWriter::stripNonWCInfo( gleResponseSerial.toBSON() );

            // Use the downconversion tools to determine if this GLE response is ok, a
            // write concern error, or an unknown error we should immediately abort for.
            BatchSafeWriter::GLEErrors errors;
            Status extractStatus = BatchSafeWriter::extractGLEErrors( gleResponse, &errors );
            if ( !extractStatus.isOK() ) {
                failedStatuses.push_back( extractStatus );
                continue;
            }

            LegacyWCResponse wcResponse;
            wcResponse.shardHost = shardEndpoint.toString();
            wcResponse.gleResponse = gleResponse;
            if ( errors.wcError.get() ) {
                wcResponse.errToReport = errors.wcError->getErrMessage();
            }

            legacyWCResponses->push_back( wcResponse );
        }

        if ( failedStatuses.empty() ) {
            return Status::OK();
        }

        StringBuilder builder;
        builder << "could not enforce write concern";

        for ( vector<Status>::const_iterator it = failedStatuses.begin();
            it != failedStatuses.end(); ++it ) {
            const Status& failedStatus = *it;
            if ( it == failedStatuses.begin() ) {
                builder << causedBy( failedStatus.toString() );
            }
            else {
                builder << ":: and ::" << failedStatus.toString();
            }
        }

        return Status( failedStatuses.size() == 1u ? failedStatuses.front().code() : 
                                                     ErrorCodes::MultipleErrorsOccurred, 
                       builder.str() );
    }
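The helper buildGLECmdWithOpTime used above is not shown in these examples. A minimal sketch of what it presumably does, assuming it simply prepends getLastError and appends the opTime and election id (the field names wOpTime and wElectionId are assumptions), is:

    // Hedged sketch only: field names beyond "getLastError" are assumptions.
    BSONObj buildGLECmdWithOpTime( const BSONObj& gleOptions,
                                   const OpTime& opTime,
                                   const OID& electionId ) {
        BSONObjBuilder builder;
        builder.append( "getLastError", 1 );
        // Carry over any caller-supplied write concern options (w, wtimeout, j, ...)
        builder.appendElementsUnique( gleOptions );
        // Enforce the write concern at the given replication point
        builder.appendTimestamp( "wOpTime", opTime.asDate() );
        builder.appendOID( "wElectionId", const_cast<OID*>( &electionId ) );
        return builder.obj();
    }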
Example #2
0
 void DBException::traceIfNeeded( const DBException& e ) {
     if( traceExceptions && ! inShutdown() ){
         warning() << "DBException thrown" << causedBy( e ) << endl;
         printStackTrace();
     }
 }
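A typical call site for the helper above is a catch block that wants the optional trace without swallowing the exception; a minimal usage sketch, where doWork() is a hypothetical operation that may throw:

 try {
     doWork();  // hypothetical
 }
 catch ( const DBException& e ) {
     DBException::traceIfNeeded( e );  // warn + stack trace only if traceExceptions is set
     throw;                            // rethrow so normal error handling still runs
 }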
Example #3
0
void updateChunkWriteStatsAndSplitIfNeeded(OperationContext* opCtx,
                                           ChunkManager* manager,
                                           Chunk* chunk,
                                           long dataWritten) {
    // Disable lastError tracking so that any errors which occur during auto-split do not get
    // bubbled up on the client connection doing a write
    LastError::Disabled disableLastError(&LastError::get(opCtx->getClient()));

    const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration();

    const bool minIsInf =
        (0 == manager->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk->getMin()));
    const bool maxIsInf =
        (0 == manager->getShardKeyPattern().getKeyPattern().globalMax().woCompare(chunk->getMax()));

    const uint64_t chunkBytesWritten = chunk->addBytesWritten(dataWritten);

    const uint64_t desiredChunkSize =
        calculateDesiredChunkSize(balancerConfig->getMaxChunkSizeBytes(), manager->numChunks());

    if (!chunk->shouldSplit(desiredChunkSize, minIsInf, maxIsInf)) {
        return;
    }

    const NamespaceString nss(manager->getns());

    if (!manager->_autoSplitThrottle._splitTickets.tryAcquire()) {
        LOG(1) << "won't auto split because not enough tickets: " << nss;
        return;
    }

    TicketHolderReleaser releaser(&(manager->_autoSplitThrottle._splitTickets));

    const ChunkRange chunkRange(chunk->getMin(), chunk->getMax());

    try {
        // Ensure we have the most up-to-date balancer configuration
        uassertStatusOK(balancerConfig->refreshAndCheck(opCtx));

        if (!balancerConfig->getShouldAutoSplit()) {
            return;
        }

        LOG(1) << "about to initiate autosplit: " << redact(chunk->toString())
               << " dataWritten: " << chunkBytesWritten
               << " desiredChunkSize: " << desiredChunkSize;

        const uint64_t chunkSizeToUse = [&]() {
            const uint64_t estNumSplitPoints = chunkBytesWritten / desiredChunkSize * 2;

            if (estNumSplitPoints >= kTooManySplitPoints) {
                // The current desired chunk size will split the chunk into lots of small chunks,
                // and in the worst case this can result in thousands of chunks. So check and see
                // if a bigger value can be used.
                return std::min(chunkBytesWritten, balancerConfig->getMaxChunkSizeBytes());
            } else {
                return desiredChunkSize;
            }
        }();

        auto splitPoints =
            uassertStatusOK(shardutil::selectChunkSplitPoints(opCtx,
                                                              chunk->getShardId(),
                                                              nss,
                                                              manager->getShardKeyPattern(),
                                                              chunkRange,
                                                              chunkSizeToUse,
                                                              boost::none));

        if (splitPoints.size() <= 1) {
            // No split points means there isn't enough data to split on; 1 split point means we
            // have between half the chunk size and the full chunk size, so there is no need to
            // split yet
            chunk->clearBytesWritten();
            return;
        }

        if (minIsInf || maxIsInf) {
            // We don't want to reset _dataWritten since we want to check the other side right away
        } else {
            // We're splitting, so should wait a bit
            chunk->clearBytesWritten();
        }

        // We assume that if the chunk being split is the first (or last) one on the collection,
        // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use the
        // very first (or last) key as a split point.
        //
        // This heuristic is skipped for "special" shard key patterns that are not likely to produce
        // monotonically increasing or decreasing values (e.g. hashed shard keys).
        if (KeyPattern::isOrderedKeyPattern(manager->getShardKeyPattern().toBSON())) {
            if (minIsInf) {
                BSONObj key = findExtremeKeyForShard(
                    opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), true);
                if (!key.isEmpty()) {
                    splitPoints.front() = key.getOwned();
                }
            } else if (maxIsInf) {
                BSONObj key = findExtremeKeyForShard(
                    opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), false);
                if (!key.isEmpty()) {
                    splitPoints.back() = key.getOwned();
                }
            }
        }

        const auto suggestedMigrateChunk =
            uassertStatusOK(shardutil::splitChunkAtMultiplePoints(opCtx,
                                                                  chunk->getShardId(),
                                                                  nss,
                                                                  manager->getShardKeyPattern(),
                                                                  manager->getVersion(),
                                                                  chunkRange,
                                                                  splitPoints));

        // Balance the resulting chunks if the option is enabled and if the shard suggested a chunk
        // to balance
        const bool shouldBalance = [&]() {
            if (!balancerConfig->shouldBalanceForAutoSplit())
                return false;

            auto collStatus =
                Grid::get(opCtx)->catalogClient()->getCollection(opCtx, manager->getns());
            if (!collStatus.isOK()) {
                log() << "Auto-split for " << nss << " failed to load collection metadata"
                      << causedBy(redact(collStatus.getStatus()));
                return false;
            }

            return collStatus.getValue().value.getAllowBalance();
        }();

        log() << "autosplitted " << nss << " chunk: " << redact(chunk->toString()) << " into "
              << (splitPoints.size() + 1) << " parts (desiredChunkSize " << desiredChunkSize << ")"
              << (!suggestedMigrateChunk ? "" : (std::string) " (migrate suggested" +
                          (shouldBalance ? ")" : ", but no migrations allowed)"));

        // Reload the chunk manager after the split
        auto routingInfo = uassertStatusOK(
            Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx,
                                                                                         nss));

        if (!shouldBalance || !suggestedMigrateChunk) {
            return;
        }

        // Top chunk optimization - try to move the top chunk out of this shard to prevent the hot
        // spot from staying on a single shard. This is based on the assumption that succeeding
        // inserts will fall on the top chunk.

        // We need to use the latest chunk manager (after the split) in order to have the most
        // up-to-date view of the chunk we are about to move
        auto suggestedChunk = routingInfo.cm()->findIntersectingChunkWithSimpleCollation(
            suggestedMigrateChunk->getMin());

        ChunkType chunkToMove;
        chunkToMove.setNS(nss.ns());
        chunkToMove.setShard(suggestedChunk->getShardId());
        chunkToMove.setMin(suggestedChunk->getMin());
        chunkToMove.setMax(suggestedChunk->getMax());
        chunkToMove.setVersion(suggestedChunk->getLastmod());

        uassertStatusOK(configsvr_client::rebalanceChunk(opCtx, chunkToMove));

        // Ensure the collection gets reloaded because of the move
        Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss);
    } catch (const DBException& ex) {
        chunk->clearBytesWritten();

        if (ErrorCodes::isStaleShardingError(ErrorCodes::Error(ex.getCode()))) {
            log() << "Unable to auto-split chunk " << redact(chunkRange.toString()) << causedBy(ex)
                  << ", going to invalidate routing table entry for " << nss;
            Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss);
        }
    }
}
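The desired chunk size above comes from calculateDesiredChunkSize, which is not shown in these examples. A plausible sketch of such a ramp-up helper follows; the concrete thresholds are assumptions, chosen only to illustrate that collections with few chunks are split more aggressively than mature ones:

// Hedged sketch only: the actual thresholds are not shown above and are assumptions.
uint64_t calculateDesiredChunkSize(uint64_t maxChunkSizeBytes, int numChunks) {
    const uint64_t kOneMB = 1024 * 1024;

    // Young collections: split at a fraction of the configured maximum so an
    // initial load spreads across shards sooner
    if (numChunks < 10) {
        return std::max(maxChunkSizeBytes / 4, kOneMB);
    }
    if (numChunks < 20) {
        return std::max(maxChunkSizeBytes / 2, kOneMB);
    }

    // Mature collections: use the configured maximum chunk size as-is
    return maxChunkSizeBytes;
}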
Example #4
0
    auto_ptr<DBClientCursor> DBClientReplicaSet::query(const string &ns, Query query, int nToReturn, int nToSkip,
            const BSONObj *fieldsToReturn, int queryOptions, int batchSize) {

        if ( queryOptions & QueryOption_SlaveOk ) {
            // we're ok sending to a slave
            // we'll try 2 slaves before just using master
            // checkSlave will try a different slave automatically after a failure
            for ( int i=0; i<3; i++ ) {
                try {
                    return checkSlaveQueryResult( checkSlave()->query(ns,query,nToReturn,nToSkip,fieldsToReturn,queryOptions,batchSize) );
                }
                catch ( DBException &e ) {
                    LOG(1) << "can't query replica set slave " << i << " : " << _slaveHost << causedBy( e ) << endl;
                }
            }
        }

        return checkMaster()->query(ns,query,nToReturn,nToSkip,fieldsToReturn,queryOptions,batchSize);
    }
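A caller-side sketch showing how the slaveOk path above is exercised; the set name, seed hosts, and namespace are hypothetical:

    vector<HostAndPort> seeds;
    seeds.push_back( HostAndPort( "node1.example.net:27017" ) );
    seeds.push_back( HostAndPort( "node2.example.net:27017" ) );

    DBClientReplicaSet conn( "rs0", seeds );
    if ( !conn.connect() ) {
        // handle connection failure
    }

    // QueryOption_SlaveOk lets query() try checkSlave() before falling back to the master
    auto_ptr<DBClientCursor> cursor = conn.query( "test.coll",
                                                  QUERY( "status" << "active" ),
                                                  0, 0, NULL, QueryOption_SlaveOk );
    while ( cursor.get() && cursor->more() ) {
        cout << cursor->next() << endl;
    }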
Example #5
0
    void DBClientReplicaSet::say( Message& toSend, bool isRetry ) {

        if( ! isRetry )
            _lazyState = LazyState();

        int lastOp = -1;
        bool slaveOk = false;

        if ( ( lastOp = toSend.operation() ) == dbQuery ) {
            // TODO: might be possible to do this faster by changing api
            DbMessage dm( toSend );
            QueryMessage qm( dm );
            if ( ( slaveOk = ( qm.queryOptions & QueryOption_SlaveOk ) ) ) {

                for ( int i = _lazyState._retries; i < 3; i++ ) {
                    try {
                        DBClientConnection* slave = checkSlave();
                        slave->say( toSend );

                        _lazyState._lastOp = lastOp;
                        _lazyState._slaveOk = slaveOk;
                        _lazyState._retries = i;
                        _lazyState._lastClient = slave;
                        return;
                    }
                    catch ( DBException &e ) {
                       LOG(1) << "can't callLazy replica set slave " << i << " : " << _slaveHost << causedBy( e ) << endl;
                    }
                }
            }
        }

        DBClientConnection* master = checkMaster();
        master->say( toSend );

        _lazyState._lastOp = lastOp;
        _lazyState._slaveOk = slaveOk;
        _lazyState._retries = 3;
        _lazyState._lastClient = master;
        return;
    }
Example #6
0
StatusWith<vector<MigrateInfo>> Balancer::_getCandidateChunks(OperationContext* txn) {
    vector<CollectionType> collections;

    Status collsStatus =
        grid.catalogManager(txn)->getCollections(txn, nullptr, &collections, nullptr);
    if (!collsStatus.isOK()) {
        return collsStatus;
    }

    if (collections.empty()) {
        return vector<MigrateInfo>();
    }

    // Get a list of all the shards that are participating in this balance round along with any
    // maximum allowed quotas and current utilization. We get the latter by issuing
    // db.serverStatus() (mem.mapped) to all shards.
    //
    // TODO: skip unresponsive shards and mark information as stale.
    auto shardInfoStatus = DistributionStatus::populateShardInfoMap(txn);
    if (!shardInfoStatus.isOK()) {
        return shardInfoStatus.getStatus();
    }

    const ShardInfoMap shardInfo(std::move(shardInfoStatus.getValue()));

    if (shardInfo.size() < 2) {
        return vector<MigrateInfo>();
    }

    OCCASIONALLY warnOnMultiVersion(shardInfo);

    std::vector<MigrateInfo> candidateChunks;

    // For each collection, check if the balancing policy recommends moving anything around.
    for (const auto& coll : collections) {
        // Skip collections for which balancing is disabled
        const NamespaceString& nss = coll.getNs();

        if (!coll.getAllowBalance()) {
            LOG(1) << "Not balancing collection " << nss << "; explicitly disabled.";
            continue;
        }

        std::vector<ChunkType> allNsChunks;
        Status status = grid.catalogManager(txn)->getChunks(txn,
                                                            BSON(ChunkType::ns(nss.ns())),
                                                            BSON(ChunkType::min() << 1),
                                                            boost::none,  // all chunks
                                                            &allNsChunks,
                                                            nullptr);
        if (!status.isOK()) {
            warning() << "failed to load chunks for ns " << nss.ns() << causedBy(status);
            continue;
        }

        set<BSONObj> allChunkMinimums;
        map<string, vector<ChunkType>> shardToChunksMap;

        for (const ChunkType& chunk : allNsChunks) {
            allChunkMinimums.insert(chunk.getMin().getOwned());

            vector<ChunkType>& chunksList = shardToChunksMap[chunk.getShard()];
            chunksList.push_back(chunk);
        }

        if (shardToChunksMap.empty()) {
            LOG(1) << "skipping empty collection (" << nss.ns() << ")";
            continue;
        }

        for (ShardInfoMap::const_iterator i = shardInfo.begin(); i != shardInfo.end(); ++i) {
            // This loop just makes sure there is an entry in shardToChunksMap for every shard
            shardToChunksMap[i->first];
        }

        DistributionStatus distStatus(shardInfo, shardToChunksMap);

        // TODO: TagRange contains all the information from TagsType except for the namespace,
        //       so maybe the two can be merged at some point in order to avoid the
        //       transformation below.
        vector<TagRange> ranges;

        {
            vector<TagsType> collectionTags;
            uassertStatusOK(
                grid.catalogManager(txn)->getTagsForCollection(txn, nss.ns(), &collectionTags));
            for (const auto& tt : collectionTags) {
                ranges.push_back(
                    TagRange(tt.getMinKey().getOwned(), tt.getMaxKey().getOwned(), tt.getTag()));
                uassert(16356,
                        str::stream() << "tag ranges not valid for: " << nss.ns(),
                        distStatus.addTagRange(ranges.back()));
            }
        }

        auto statusGetDb = grid.catalogCache()->getDatabase(txn, nss.db().toString());
        if (!statusGetDb.isOK()) {
            warning() << "could not load db config to balance collection [" << nss.ns()
                      << "]: " << statusGetDb.getStatus();
            continue;
        }

        shared_ptr<DBConfig> cfg = statusGetDb.getValue();

        // This line reloads the chunk manager once if this process doesn't know the collection
        // is sharded yet.
        shared_ptr<ChunkManager> cm = cfg->getChunkManagerIfExists(txn, nss.ns(), true);
        if (!cm) {
            warning() << "could not load chunks to balance " << nss.ns() << " collection";
            continue;
        }

        // Loop through tags to make sure no chunk spans tags. Split on tag min for all chunks.
        bool didAnySplits = false;

        for (const TagRange& range : ranges) {
            BSONObj min =
                cm->getShardKeyPattern().getKeyPattern().extendRangeBound(range.min, false);

            if (allChunkMinimums.count(min) > 0) {
                continue;
            }

            didAnySplits = true;

            log() << "nss: " << nss.ns() << " need to split on " << min
                  << " because there is a range there";

            vector<BSONObj> splitPoints;
            splitPoints.push_back(min);

            shared_ptr<Chunk> c = cm->findIntersectingChunk(txn, min);
            Status status = c->multiSplit(txn, splitPoints, NULL);
            if (!status.isOK()) {
                error() << "split failed: " << status;
            } else {
                LOG(1) << "split worked";
            }

            break;
        }

        if (didAnySplits) {
            // State change, just wait till next round
            continue;
        }

        shared_ptr<MigrateInfo> migrateInfo(
            BalancerPolicy::balance(nss.ns(), distStatus, _balancedLastTime));
        if (migrateInfo) {
            candidateChunks.emplace_back(*migrateInfo);
        }
    }

    return candidateChunks;
}
Example #7
0
    virtual void process( Message& m , AbstractMessagingPort* p , LastError * le) {
        verify( p );
        Request r( m , p );

        verify( le );
        lastError.startRequest( m , le );

        try {
            r.init();
            r.process();

            // Release connections after non-write op
            if ( ShardConnection::releaseConnectionsAfterResponse && r.expectResponse() ) {
                LOG(2) << "release thread local connections back to pool" << endl;
                ShardConnection::releaseMyConnections();
            }
        }
        catch ( AssertionException & e ) {
            LOG( e.isUserAssertion() ? 1 : 0 ) << "AssertionException while processing op type : " << m.operation() << " to : " << r.getns() << causedBy(e) << endl;

            le->raiseError( e.getCode() , e.what() );

            m.header()->id = r.id();

            if ( r.expectResponse() ) {
                BSONObj err = BSON( "$err" << e.what() << "code" << e.getCode() );
                replyToQuery( ResultFlag_ErrSet, p , m , err );
            }
        }
        catch ( DBException& e ) {
            // note that e.toString() is more detailed on a SocketException than
            // e.what().  we should think about what is the right level of detail both
            // for logging and return code.
            log() << "DBException in process: " << e.what() << endl;

            le->raiseError( e.getCode() , e.what() );

            m.header()->id = r.id();

            if ( r.expectResponse() ) {
                BSONObjBuilder b;
                b.append("$err",e.what()).append("code",e.getCode());
                if( !e._shard.empty() ) {
                    b.append("shard",e._shard);
                }
                replyToQuery( ResultFlag_ErrSet, p , m , b.obj() );
            }
        }
    }
Example #8
0
 std::string causedBy( const DBException& e ){
     return causedBy( e.toString() );
 }
Example #9
0
 std::string causedBy( const std::exception& e ) {
     return causedBy( e.what() );
 }
Example #10
0
    /**
     * Performs a sanity check on the given connection string, verifying that the seed list
     * is consistent with the set's own view of its membership as reported by replSetGetStatus.
     */
    bool addReplSetShardCheck( const ConnectionString& servers, string* errMsg ) {
        bool ok = false;
        BSONObj replSetStat;
        try {
            ScopedDbConnection newShardConn(servers.toString());
            ok = newShardConn->runCommand( "admin", BSON( "replSetGetStatus" << 1 ),
                    replSetStat );
            newShardConn.done();
        }
        catch ( const DBException& ex ) {
            *errMsg = str::stream() << "Error encountered while checking status of "
                    << servers.toString() << ": " << causedBy( ex );
        }

        if( !ok ) {
            if ( replSetStat["info"].str() == "configsvr" ) {
                *errMsg = "the specified mongod is a --configsvr and "
                    "should thus not be a shard server";
            }
            else {
                *errMsg = str::stream() << "error encountered calling replSetGetStatus: "
                        << replSetStat;
            }

            return false;
        }

        // if the shard has only one host, make sure it is not part of a replica set
        string setName = replSetStat["set"].str();
        string commandSetName = servers.getSetName();
        if ( commandSetName.empty() && ! setName.empty() ) {
            *errMsg = str::stream() << "host is part of set: " << setName
                        << " use replica set url format <setname>/<server1>,<server2>,....";
            return false;
        }

        if ( !commandSetName.empty() && setName.empty() ) {
            *errMsg = str::stream() << "host did not return a set name, "
                        << "is the replica set still initializing?" << replSetStat;
            return false;
        }

        // if the shard is part of replica set, make sure it is the right one
        if ( ! commandSetName.empty() && ( commandSetName != setName ) ) {
            *errMsg = str::stream() << "host is part of a different set: " << setName;
            return false;
        }

        // if the shard is part of a replica set, make sure all the hosts mentioned in
        // 'servers' are part of the set. It is fine if not all members of the set
        // are present in 'servers'.
        bool foundAll = true;
        string offendingHost;
        if ( ! commandSetName.empty() ) {
            set<string> hostSet;

            BSONElement membersElem( replSetStat["members"] );
            if ( membersElem.type() == Array ) {
                BSONArrayIteratorSorted iter( BSONArray( membersElem.Obj() ));
                while ( iter.more() ) {
                    hostSet.insert( iter.next()["name"].str() ); // host:port
                }

                vector<HostAndPort> hosts = servers.getServers();
                for ( size_t i = 0 ; i < hosts.size() ; i++ ) {
                    if (!hosts[i].hasPort()) {
                        hosts[i].setPort(CmdLine::DefaultDBPort);
                    }
                    string host = hosts[i].toString(); // host:port
                    if ( hostSet.find( host ) == hostSet.end() ) {
                        offendingHost = host;
                        foundAll = false;
                        break;
                    }
                }
            }

            if ( hostSet.empty() ) {
                *errMsg = "replSetGetStatus returned an empty set. "
                        " Please wait for the set to initialize and try again.";
                return false;
            }
        }

        if ( ! foundAll ) {
            *errMsg = str::stream() << "in seed list " << servers.toString()
                            << ", host " << offendingHost
                            << " does not belong to replica set " << setName;
            return false;
        }

        return true;
    }
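For reference, a replSetGetStatus response shape that satisfies the member check above; the set name and host names are made up:

    // Hypothetical replSetGetStatus response from a healthy set
    BSONObj replSetStat = BSON( "set" << "rs0"
                                << "members" << BSON_ARRAY( BSON( "name" << "node1:27017" )
                                                            << BSON( "name" << "node2:27017" ) )
                                << "ok" << 1 );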
Example #11
0
    bool Grid::addShard( string* name , const ConnectionString& servers , long long maxSize , string& errMsg ) {
        // name can be NULL, so provide a dummy one here to avoid testing it elsewhere
        string nameInternal;
        if ( ! name ) {
            name = &nameInternal;
        }

        ReplicaSetMonitorPtr rsMonitor;

        // Check whether the host (or set) exists and run several sanity checks on this request.
        // There are two sets of sanity checks: making sure adding this particular shard is consistent
        // with the replica set state (if it exists) and making sure this shard's databases can be
        // brought into the grid without conflict.

        if ( servers.type() == ConnectionString::SYNC ) {
            errMsg = "can't use sync cluster as a shard for replica set, "
                    "have to use <setname>/<server1>,<server2>,...";
            return false;
        }

        vector<string> dbNames;
        try {
            bool ok = false;

            {
                ScopedDbConnection newShardConn(servers.toString());

                BSONObj resIsMongos;
                ok = newShardConn->runCommand( "admin", BSON( "isdbgrid" << 1 ), resIsMongos );
                newShardConn.done();
            }

            // should return ok=0, cmd not found if it's a normal mongod
            if ( ok ) {
                errMsg = "can't add a mongos process as a shard";
                return false;
            }

            if ( servers.type() == ConnectionString::SET ) {
                if (!addReplSetShardCheck( servers, &errMsg )) {
                    return false;
                }

                // shard name defaults to the name of the replica set
                if ( name->empty() && !servers.getSetName().empty() ) {
                    *name = servers.getSetName();
                }
            }

            // In order to be accepted as a new shard, that mongod must not have any database name
            // that exists already in any other shards. If that test passes, the new shard's
            // databases are going to be entered as non-sharded db's whose primary is the
            // newly added shard.

            BSONObj resListDB;
            {
                ScopedDbConnection newShardConn(servers.toString());
                ok = newShardConn->runCommand( "admin", BSON( "listDatabases" << 1 ), resListDB );
                newShardConn.done();
            }

            if ( !ok ) {
                errMsg = str::stream() << "failed listing " << servers.toString()
                            << "'s databases:" << resListDB;;
                return false;
            }

            BSONObjIterator i( resListDB["databases"].Obj() );
            while ( i.more() ) {
                BSONObj dbEntry = i.next().Obj();
                const string& dbName = dbEntry["name"].String();
                if ( _isSpecialLocalDB( dbName ) ) {
                    // 'local', 'admin', and 'config' are system DBs and should be excluded here
                    continue;
                }
                else {
                    dbNames.push_back( dbName );
                }
            }

            if ( servers.type() == ConnectionString::SET ) {
                rsMonitor = ReplicaSetMonitor::get( servers.getSetName() );
            }
        }
        catch ( DBException& e ) {
            if ( servers.type() == ConnectionString::SET ) {
                ReplicaSetMonitor::remove( servers.getSetName() );
            }

            errMsg = str::stream() << "couldn't connect to new shard " << causedBy(e);
            return false;
        }

        // check that none of the existing shard candidate's db's exist elsewhere
        for ( vector<string>::const_iterator it = dbNames.begin(); it != dbNames.end(); ++it ) {
            DBConfigPtr config = getDBConfig( *it , false );
            if ( config.get() != NULL ) {
                ostringstream ss;
                ss << "can't add shard " << servers.toString() << " because a local database '" << *it;
                ss << "' exists in another " << config->getPrimary().toString();
                errMsg = ss.str();
                return false;
            }
        }

        // if a name for a shard wasn't provided, pick one.
        if ( name->empty() && ! _getNewShardName( name ) ) {
            errMsg = "error generating new shard name";
            return false;
        }

        // build the ConfigDB shard document
        BSONObjBuilder b;
        b.append(ShardType::name(), *name);
        b.append(ShardType::host(),
                 rsMonitor ? rsMonitor->getServerAddress() : servers.toString());
        if (maxSize > 0) {
            b.append(ShardType::maxSize(), maxSize);
        }
        BSONObj shardDoc = b.obj();

        {
            ScopedDbConnection conn(configServer.getPrimary().getConnString(), 30);

            // check whether the set of hosts (or single host) is not already a known shard
            BSONObj old = conn->findOne(ShardType::ConfigNS,
                                        BSON(ShardType::host(servers.toString())));

            if ( ! old.isEmpty() ) {
                errMsg = "host already used";
                conn.done();
                return false;
            }

            log() << "going to add shard: " << shardDoc << endl;

            conn->insert(ShardType::ConfigNS , shardDoc);
            errMsg = conn->getLastError();
            if ( ! errMsg.empty() ) {
                log() << "error adding shard: " << shardDoc << " err: " << errMsg << endl;
                conn.done();
                return false;
            }

            conn.done();
        }

        Shard::reloadShardInfo();

        // add all databases of the new shard
        for ( vector<string>::const_iterator it = dbNames.begin(); it != dbNames.end(); ++it ) {
            DBConfigPtr config = getDBConfig( *it , true , *name );
            if ( ! config ) {
                log() << "adding shard " << servers << " even though could not add database " << *it << endl;
            }
        }

        // Record in changelog
        BSONObjBuilder shardDetails;
        shardDetails.append("name", *name);
        shardDetails.append("host", servers.toString());
        configServer.logChange("addShard", "", shardDetails.obj());

        return true;
    }
Example #12
0
    /**
     * Outline of the delete process:
     * 1. Initialize the client for this thread if there is no client. This is for the worker
     *    threads that are attached to any of the threads servicing client requests.
     * 2. Grant this thread authorization to perform deletes.
     * 3. Temporarily enable mode to bypass shard version checks. TODO: Replace this hack.
     * 4. Setup callback to save deletes to moveChunk directory (only if moveParanoia is true).
     * 5. Delete range.
     * 6. Wait until the majority of the secondaries catch up.
     */
    bool RangeDeleterDBEnv::deleteRange(OperationContext* txn,
                                        const RangeDeleteEntry& taskDetails,
                                        long long int* deletedDocs,
                                        std::string* errMsg) {
        const string ns(taskDetails.options.range.ns);
        const BSONObj inclusiveLower(taskDetails.options.range.minKey);
        const BSONObj exclusiveUpper(taskDetails.options.range.maxKey);
        const BSONObj keyPattern(taskDetails.options.range.keyPattern);
        const WriteConcernOptions writeConcern(taskDetails.options.writeConcern);
        const bool fromMigrate = taskDetails.options.fromMigrate;
        const bool onlyRemoveOrphans = taskDetails.options.onlyRemoveOrphanedDocs;

        const bool initiallyHaveClient = haveClient();

        if (!initiallyHaveClient) {
            Client::initThread("RangeDeleter");
        }

        *deletedDocs = 0;
        ShardForceVersionOkModeBlock forceVersion;
        {
            Helpers::RemoveSaver removeSaver("moveChunk",
                                             ns,
                                             taskDetails.options.removeSaverReason);
            Helpers::RemoveSaver* removeSaverPtr = NULL;
            if (serverGlobalParams.moveParanoia &&
                    !taskDetails.options.removeSaverReason.empty()) {
                removeSaverPtr = &removeSaver;
            }

            // log the opId so the user can use it to cancel the delete using killOp.
            unsigned int opId = txn->getCurOp()->opNum();
            log() << "Deleter starting delete for: " << ns
                  << " from " << inclusiveLower
                  << " -> " << exclusiveUpper
                  << ", with opId: " << opId
                  << endl;

            try {
                *deletedDocs =
                        Helpers::removeRange(txn,
                                             KeyRange(ns,
                                                      inclusiveLower,
                                                      exclusiveUpper,
                                                      keyPattern),
                                             false, /*maxInclusive*/
                                             writeConcern,
                                             removeSaverPtr,
                                             fromMigrate,
                                             onlyRemoveOrphans);

                if (*deletedDocs < 0) {
                    *errMsg = "collection or index dropped before data could be cleaned";
                    warning() << *errMsg << endl;

                    if (!initiallyHaveClient) {
                        txn->getClient()->shutdown();
                    }

                    return false;
                }

                log() << "rangeDeleter deleted " << *deletedDocs
                      << " documents for " << ns
                      << " from " << inclusiveLower
                      << " -> " << exclusiveUpper
                      << endl;
            }
            catch (const DBException& ex) {
                *errMsg = str::stream() << "Error encountered while deleting range: "
                                        << "ns" << ns
                                        << " from " << inclusiveLower
                                        << " -> " << exclusiveUpper
                                        << ", cause by:" << causedBy(ex);

                if (!initiallyHaveClient) {
                    txn->getClient()->shutdown();
                }

                return false;
            }
        }

        if (!initiallyHaveClient) {
            txn->getClient()->shutdown();
        }

        return true;
    }
Example #13
0
StatusWith<ShardType> ShardingCatalogManagerImpl::_validateHostAsShard(
    OperationContext* txn,
    std::shared_ptr<RemoteCommandTargeter> targeter,
    const std::string* shardProposedName,
    const ConnectionString& connectionString) {
    // Check whether any host in the connection is already part of the cluster.
    Grid::get(txn)->shardRegistry()->reload(txn);
    for (const auto& hostAndPort : connectionString.getServers()) {
        std::shared_ptr<Shard> shard;
        shard = Grid::get(txn)->shardRegistry()->getShardNoReload(hostAndPort.toString());
        if (shard) {
            return {ErrorCodes::OperationFailed,
                    str::stream() << "'" << hostAndPort.toString() << "' "
                    << "is already a member of the existing shard '"
                    << shard->getConnString().toString()
                    << "' ("
                    << shard->getId()
                    << ")."};
        }
    }

    // Check for mongos and older-version mongod connections, and whether the hosts
    // can be found for the user-specified replica set.
    auto swCommandResponse =
        _runCommandForAddShard(txn, targeter.get(), "admin", BSON("isMaster" << 1));
    if (!swCommandResponse.isOK()) {
        if (swCommandResponse.getStatus() == ErrorCodes::RPCProtocolNegotiationFailed) {
            // Mongos to mongos commands are no longer supported in the wire protocol
            // (because mongos does not support OP_COMMAND), similarly for a new mongos
            // and an old mongod. So the call will fail in such cases.
            // TODO: If/When mongos ever supports opCommands, this logic will break because
            // cmdStatus will be OK.
            return {ErrorCodes::RPCProtocolNegotiationFailed,
                    str::stream() << targeter->connectionString().toString()
                    << " does not recognize the RPC protocol being used. This is"
                    << " likely because it contains a node that is a mongos or an old"
                    << " version of mongod."};
        } else {
            return swCommandResponse.getStatus();
        }
    }

    // Check for a command response error
    auto resIsMasterStatus = std::move(swCommandResponse.getValue().commandStatus);
    if (!resIsMasterStatus.isOK()) {
        return {resIsMasterStatus.code(),
                str::stream() << "Error running isMaster against "
                << targeter->connectionString().toString()
                << ": "
                << causedBy(resIsMasterStatus)};
    }

    auto resIsMaster = std::move(swCommandResponse.getValue().response);

    // Check whether there is a master. If there isn't, the replica set may not have been
    // initiated. If the connection is a standalone, it will return true for isMaster.
    bool isMaster;
    Status status = bsonExtractBooleanField(resIsMaster, "ismaster", &isMaster);
    if (!status.isOK()) {
        return Status(status.code(),
                      str::stream() << "isMaster returned invalid 'ismaster' "
                      << "field when attempting to add "
                      << connectionString.toString()
                      << " as a shard: "
                      << status.reason());
    }
    if (!isMaster) {
        return {ErrorCodes::NotMaster,
                str::stream()
                << connectionString.toString()
                << " does not have a master. If this is a replica set, ensure that it has a"
                << " healthy primary and that the set has been properly initiated."};
    }

    const string providedSetName = connectionString.getSetName();
    const string foundSetName = resIsMaster["setName"].str();

    // Make sure the specified replica set name (if any) matches the actual shard's replica set
    if (providedSetName.empty() && !foundSetName.empty()) {
        return {ErrorCodes::OperationFailed,
                str::stream() << "host is part of set " << foundSetName << "; "
                << "use replica set url format "
                << "<setname>/<server1>,<server2>, ..."};
    }

    if (!providedSetName.empty() && foundSetName.empty()) {
        return {ErrorCodes::OperationFailed,
                str::stream() << "host did not return a set name; "
                << "is the replica set still initializing? "
                << resIsMaster};
    }

    // Make sure the set name specified in the connection string matches the one that its hosts
    // actually belong to
    if (!providedSetName.empty() && (providedSetName != foundSetName)) {
        return {ErrorCodes::OperationFailed,
                str::stream() << "the provided connection string (" << connectionString.toString()
                << ") does not match the actual set name "
                << foundSetName};
    }

    // Is it a config server?
    if (resIsMaster.hasField("configsvr")) {
        return {ErrorCodes::OperationFailed,
                str::stream() << "Cannot add " << connectionString.toString()
                << " as a shard since it is a config server"};
    }

    // If the shard is part of a replica set, make sure all the hosts mentioned in the connection
    // string are part of the set. It is fine if not all members of the set are mentioned in the
    // connection string, though.
    if (!providedSetName.empty()) {
        std::set<string> hostSet;

        BSONObjIterator iter(resIsMaster["hosts"].Obj());
        while (iter.more()) {
            hostSet.insert(iter.next().String());  // host:port
        }

        if (resIsMaster["passives"].isABSONObj()) {
            BSONObjIterator piter(resIsMaster["passives"].Obj());
            while (piter.more()) {
                hostSet.insert(piter.next().String());  // host:port
            }
        }

        if (resIsMaster["arbiters"].isABSONObj()) {
            BSONObjIterator piter(resIsMaster["arbiters"].Obj());
            while (piter.more()) {
                hostSet.insert(piter.next().String());  // host:port
            }
        }

        vector<HostAndPort> hosts = connectionString.getServers();
        for (size_t i = 0; i < hosts.size(); i++) {
            const string host = hosts[i].toString();  // host:port
            if (hostSet.find(host) == hostSet.end()) {
                return {ErrorCodes::OperationFailed,
                        str::stream() << "in seed list " << connectionString.toString() << ", host "
                        << host
                        << " does not belong to replica set "
                        << foundSetName
                        << "; found "
                        << resIsMaster.toString()};
            }
        }
    }

    string actualShardName;

    if (shardProposedName) {
        actualShardName = *shardProposedName;
    } else if (!foundSetName.empty()) {
        // Default it to the name of the replica set
        actualShardName = foundSetName;
    }

    // Disallow adding shard replica set with name 'config'
    if (actualShardName == "config") {
        return {ErrorCodes::BadValue, "use of shard replica set with name 'config' is not allowed"};
    }

    // Retrieve the most up to date connection string that we know from the replica set monitor (if
    // this is a replica set shard, otherwise it will be the same value as connectionString).
    ConnectionString actualShardConnStr = targeter->connectionString();

    ShardType shard;
    shard.setName(actualShardName);
    shard.setHost(actualShardConnStr.toString());

    return shard;
}
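For reference, an isMaster response shape that passes all of the checks above (a primary, a matching set name, every seed-list host present, and no configsvr field); the host names and set name are made up:

// Hypothetical isMaster response from a healthy replica set primary
BSONObj resIsMaster = BSON( "ismaster" << true
                            << "setName" << "rs0"
                            << "hosts" << BSON_ARRAY( "node1:27017" << "node2:27017" )
                            << "passives" << BSON_ARRAY( "node3:27017" )
                            << "ok" << 1 );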
Example #14
0
        void runThread(ConnectionString& hostConn, unsigned threadId, unsigned seed,
                       BSONObj& cmdObj, BSONObjBuilder& result) {

            stringstream ss;
            ss << "thread-" << threadId;
            setThreadName(ss.str().c_str());

            // Lock name
            string lockName = string_field(cmdObj, "lockName", this->name + "_lock");

            // Range of clock skew in diff threads
            int skewRange = (int) number_field(cmdObj, "skewRange", 1);

            // How long to wait with the lock
            int threadWait = (int) number_field(cmdObj, "threadWait", 30);
            if(threadWait <= 0) threadWait = 1;

            // Max amount of time (ms) a thread waits before checking the lock again
            int threadSleep = (int) number_field(cmdObj, "threadSleep", 30);
            if(threadSleep <= 0) threadSleep = 1;

            // How long until the lock is forced in ms, only compared locally
            unsigned long long takeoverMS = (unsigned long long) number_field(cmdObj, "takeoverMS", 0);

            // Whether or not we should hang some threads
            int hangThreads = (int) number_field(cmdObj, "hangThreads", 0);


            boost::mt19937 gen((boost::mt19937::result_type) seed);

            boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomSkew(gen, boost::uniform_int<>(0, skewRange));
            boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomWait(gen, boost::uniform_int<>(1, threadWait));
            boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomSleep(gen, boost::uniform_int<>(1, threadSleep));


            int skew = 0;
            if (!lock.get()) {

                // Pick a skew, but the first two threads skew the whole range
                if(threadId == 0)
                    skew = -skewRange / 2;
                else if(threadId == 1)
                    skew = skewRange / 2;
                else skew = randomSkew() - (skewRange / 2);

                // Skew this thread
                jsTimeVirtualThreadSkew( skew );

                log() << "Initializing lock with skew of " << skew << " for thread " << threadId << endl;

                lock.reset(new DistributedLock(hostConn, lockName, takeoverMS, true ));

                log() << "Skewed time " << jsTime() << "  for thread " << threadId << endl
                      << "  max wait (with lock: " << threadWait << ", after lock: " << threadSleep << ")" << endl
                      << "  takeover in " << takeoverMS << "(ms remote)" << endl;

            }

            DistributedLock* myLock = lock.get();

            bool errors = false;
            BSONObj lockObj;
            while (keepGoing) {
                try {

                    if (myLock->lock_try("Testing distributed lock with skew.", false, &lockObj )) {

                        log() << "**** Locked for thread " << threadId << " with ts " << lockObj["ts"] << endl;

                        if( count % 2 == 1 && ! myLock->lock_try( "Testing lock re-entry.", true ) ) {
                            errors = true;
                            log() << "**** !Could not re-enter lock already held" << endl;
                            break;
                        }

                        if( count % 3 == 1 && myLock->lock_try( "Testing lock non-re-entry.", false ) ) {
                            errors = true;
                            log() << "**** !Invalid lock re-entry" << endl;
                            break;
                        }

                        count++;
                        int before = count;
                        int sleep = randomWait();
                        sleepmillis(sleep);
                        int after = count;

                        if(after != before) {
                            errors = true;
                            log() << "**** !Bad increment while sleeping with lock for: " << sleep << "ms" << endl;
                            break;
                        }

                        // Unlock only half the time...
                        if(hangThreads == 0 || threadId % hangThreads != 0) {
                            log() << "**** Unlocking for thread " << threadId << " with ts " << lockObj["ts"] << endl;
                            myLock->unlock( &lockObj );
                        }
                        else {
                            log() << "**** Not unlocking for thread " << threadId << endl;
                            DistributedLock::killPinger( *myLock );
                            // We're simulating a crashed process...
                            break;
                        }
                    }

                }
                catch( LockException& e ) {
                    log() << "*** !Could not try distributed lock." << causedBy( e ) << endl;
                    break;
                }

                sleepmillis(randomSleep());
            }

            result << "errors" << errors
                   << "skew" << skew
                   << "takeover" << (long long) takeoverMS
                   << "localTimeout" << (takeoverMS > 0);

        }
Example #15
0
    DBClientBase* DBClientReplicaSet::callLazy( Message& toSend ) {
        if ( toSend.operation() == dbQuery ) {
            // TODO: might be possible to do this faster by changing api
            DbMessage dm( toSend );
            QueryMessage qm( dm );
            if ( qm.queryOptions & QueryOption_SlaveOk ) {
                for ( int i=0; i<3; i++ ) {
                    try {
                        return checkSlave()->callLazy( toSend );
                    }
                    catch ( DBException &e ) {
                    	LOG(1) << "can't callLazy replica set slave " << i << " : " << _slaveHost << causedBy( e ) << endl;
                    }
                }
            }
        }

        return checkMaster()->callLazy( toSend );
    }
Example #16
0
 std::string causedBy( const std::string& e ){
     return causedBy( e.c_str() );
 }
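The overloads shown in these examples all funnel down to a const char* overload that is not included; a minimal sketch of that base case, assuming a simple prefix-and-concatenate implementation (the exact prefix wording is an assumption):

 // Hedged sketch: presumed base overload; the prefix wording is an assumption.
 std::string causedBy( const char* e ) {
     return std::string( " :: caused by :: " ) + e;
 }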
Example #17
0
void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely.
    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();
        if (!_init(txn.get())) {
            log() << "will retry to initialize balancer in one minute";
            sleepsecs(60);
            continue;
        }

        break;
    }

    Seconds balanceRoundInterval(kBalanceRoundDefaultInterval);

    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();

        BalanceRoundDetails roundDetails;

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping(txn.get());

            MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) {
                const BSONObj& data = scopedBalancerRoundInterval.getData();
                balanceRoundInterval = Seconds(data["sleepSecs"].numberInt());
            }

            BSONObj balancerResult;

            // use fresh shard state
            grid.shardRegistry()->reload(txn.get());

            // refresh chunk size (even though another balancer might be active)
            Chunk::refreshChunkSize(txn.get());

            auto balSettingsResult = grid.catalogManager(txn.get())->getGlobalSettings(
                txn.get(), SettingsType::BalancerDocKey);
            const bool isBalSettingsAbsent =
                balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument;
            if (!balSettingsResult.isOK() && !isBalSettingsAbsent) {
                warning() << balSettingsResult.getStatus();
                return;
            }

            const SettingsType& balancerConfig =
                isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue();

            // now make sure we should even be running
            if ((!isBalSettingsAbsent && !Chunk::shouldBalance(balancerConfig)) ||
                MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(txn.get(), true);

                sleepFor(balanceRoundInterval);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get()));

            {
                auto scopedDistLock = grid.catalogManager(txn.get())
                                          ->distLock(txn.get(),
                                                     "balancer",
                                                     "doing balance round",
                                                     DistLockManager::kSingleLockAttemptTimeout);

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(txn.get(), true);

                    sleepFor(balanceRoundInterval);  // no need to wake up soon
                    continue;
                }

                const bool waitForDelete =
                    (balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete()
                                                         : false);

                MigrationSecondaryThrottleOptions secondaryThrottle(
                    MigrationSecondaryThrottleOptions::create(
                        MigrationSecondaryThrottleOptions::kDefault));
                if (balancerConfig.isKeySet()) {
                    secondaryThrottle =
                        uassertStatusOK(MigrationSecondaryThrottleOptions::createFromBalancerConfig(
                            balancerConfig.toBSON()));
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << waitForDelete
                       << ", secondaryThrottle: " << secondaryThrottle.toBSON();

                const auto candidateChunks = uassertStatusOK(_getCandidateChunks(txn.get()));

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime =
                        _moveChunks(txn.get(), candidateChunks, secondaryThrottle, waitForDelete);

                    roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()),
                                              _balancedLastTime);

                    grid.catalogManager(txn.get())
                        ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());
                }

                LOG(1) << "*** End of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(txn.get(), true);

            sleepFor(_balancedLastTime ? kShortBalanceRoundInterval : balanceRoundInterval);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            roundDetails.setFailed(e.what());

            grid.catalogManager(txn.get())
                ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());

            // Sleep a fair amount before retrying because of the error
            sleepFor(balanceRoundInterval);
        }
    }
}
Example #18
0
 std::string causedBy( const std::string* e ) {
     return (e && *e != "") ? causedBy(*e) : "";
 }
Example #19
0
int Balancer::_moveChunks(OperationContext* txn,
                          const vector<MigrateInfo>& candidateChunks,
                          const MigrationSecondaryThrottleOptions& secondaryThrottle,
                          bool waitForDelete) {
    int movedCount = 0;

    for (const auto& migrateInfo : candidateChunks) {
        // If the balancer was disabled since we started this round, don't start new chunk
        // moves.
        const auto balSettingsResult =
            grid.catalogManager(txn)->getGlobalSettings(txn, SettingsType::BalancerDocKey);

        const bool isBalSettingsAbsent =
            balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument;

        if (!balSettingsResult.isOK() && !isBalSettingsAbsent) {
            warning() << balSettingsResult.getStatus();
            return movedCount;
        }

        const SettingsType& balancerConfig =
            isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue();

        if ((!isBalSettingsAbsent && !Chunk::shouldBalance(balancerConfig)) ||
            MONGO_FAIL_POINT(skipBalanceRound)) {
            LOG(1) << "Stopping balancing round early as balancing was disabled";
            return movedCount;
        }

        // Changes to metadata, borked metadata, and connectivity problems between shards
        // should cause us to abort this chunk move, but shouldn't cause us to abort the entire
        // round of chunks.
        //
        // TODO(spencer): We probably *should* abort the whole round on issues communicating
        // with the config servers, but it's impossible to distinguish those types of failures
        // at the moment.
        //
        // TODO: Handle all these things more cleanly, since they're expected problems

        const NamespaceString nss(migrateInfo.ns);

        try {
            shared_ptr<DBConfig> cfg =
                uassertStatusOK(grid.catalogCache()->getDatabase(txn, nss.db().toString()));

            // NOTE: We purposely do not reload metadata here, since _getCandidateChunks already
            // tried to do so once
            shared_ptr<ChunkManager> cm = cfg->getChunkManager(txn, migrateInfo.ns);
            uassert(28628,
                    str::stream()
                        << "Collection " << migrateInfo.ns
                        << " was deleted while balancing was active. Aborting balancing round.",
                    cm);

            ChunkPtr c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min);

            if (c->getMin().woCompare(migrateInfo.chunk.min) ||
                c->getMax().woCompare(migrateInfo.chunk.max)) {
                // Likely a split happened somewhere, so force reload the chunk manager
                cm = cfg->getChunkManager(txn, migrateInfo.ns, true);
                invariant(cm);

                c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min);

                if (c->getMin().woCompare(migrateInfo.chunk.min) ||
                    c->getMax().woCompare(migrateInfo.chunk.max)) {
                    log() << "chunk mismatch after reload, ignoring will retry issue "
                          << migrateInfo.chunk.toString();

                    continue;
                }
            }

            BSONObj res;
            if (c->moveAndCommit(txn,
                                 migrateInfo.to,
                                 Chunk::MaxChunkSize,
                                 secondaryThrottle,
                                 waitForDelete,
                                 0, /* maxTimeMS */
                                 res)) {
                movedCount++;
                continue;
            }

            // The move requires acquiring the collection metadata's lock, which can fail.
            log() << "balancer move failed: " << res << " from: " << migrateInfo.from
                  << " to: " << migrateInfo.to << " chunk: " << migrateInfo.chunk;

            Status moveStatus = getStatusFromCommandResult(res);

            if (moveStatus == ErrorCodes::ChunkTooBig || res["chunkTooBig"].trueValue()) {
                // Reload just to be safe
                cm = cfg->getChunkManager(txn, migrateInfo.ns);
                invariant(cm);

                c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min);

                log() << "performing a split because migrate failed for size reasons";

                Status status = c->split(txn, Chunk::normal, NULL, NULL);
                log() << "split results: " << status;

                if (!status.isOK()) {
                    log() << "marking chunk as jumbo: " << c->toString();

                    c->markAsJumbo(txn);

                    // We increment movedCount so we do another round right away
                    movedCount++;
                }
            }
        } catch (const DBException& ex) {
            warning() << "could not move chunk " << migrateInfo.chunk.toString()
                      << ", continuing balancing round" << causedBy(ex);
        }
    }

    return movedCount;
}
Example #20
0
 std::string causedBy( const Status& e ){
     return causedBy( e.reason() );
 }
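Taken together with the string-pointer overload in Example #18, these causedBy helpers let any error object be appended to a log or error message. A minimal usage sketch follows; the Status value and log line are illustrative and not part of the excerpts above.

 // Hypothetical call site: a failing Status is appended to a warning via causedBy( const Status& ),
 // which forwards to the string overload.
 Status refreshStatus( ErrorCodes::HostUnreachable, "config server unreachable" );
 if ( !refreshStatus.isOK() ) {
     warning() << "skipping balancing round" << causedBy( refreshStatus );
 }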
Example #21
0
    bool ClientInfo::getLastError( const string& dbName,
                                   const BSONObj& options,
                                   BSONObjBuilder& result,
                                   string& errmsg,
                                   bool fromWriteBackListener)
    {

        set<string> * shards = getPrev();

        if ( shards->size() == 0 ) {
            result.appendNull( "err" );
            return true;
        }

        vector<WBInfo> writebacks;

        //
        // TODO: These branches should be collapsed into a single codepath
        //

        // handle single server
        if ( shards->size() == 1 ) {
            string theShard = *(shards->begin() );

            BSONObj res;
            bool ok = false;
            {
                LOG(5) << "gathering response for gle from: " << theShard << endl;

                ShardConnection conn( theShard , "" );
                try {
                    ok = conn->runCommand( dbName , options , res );
                }
                catch( std::exception &e ) {

                    string message =
                            str::stream() << "could not get last error from shard " << theShard
                                          << causedBy( e );

                    warning() << message << endl;
                    errmsg = message;

                    // Catch everything that happens here, since we need to ensure we return our connection when we're
                    // finished.
                    conn.done();

                    return false;
                }


                res = res.getOwned();
                conn.done();
            }

            _addWriteBack( writebacks, res, true );

            LOG(4) << "gathering writebacks from " << sinceLastGetError().size() << " hosts for"
                   << " gle (" << theShard << ")" << endl;

            // hit other machines just to block
            for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) {
                string temp = *i;
                if ( temp == theShard )
                    continue;

                LOG(5) << "gathering writebacks for single-shard gle from: " << temp << endl;

                try {
                    ShardConnection conn( temp , "" );
                    ON_BLOCK_EXIT_OBJ( conn, &ShardConnection::done );
                    _addWriteBack( writebacks, conn->getLastErrorDetailed(), false );

                }
                catch( std::exception &e ){
                    warning() << "could not clear last error from shard " << temp << causedBy( e ) << endl;
                }

            }
            clearSinceLastGetError();

            LOG(4) << "checking " << writebacks.size() << " writebacks for"
                   << " gle (" << theShard << ")" << endl;

            if ( writebacks.size() ){
                vector<BSONObj> v = _handleWriteBacks( writebacks , fromWriteBackListener );
                if ( v.size() == 0 && fromWriteBackListener ) {
                    // ok
                }
                else {
                    // This will usually be 1, but can be greater than 1 if a write to a
                    // different shard than the last write op had a writeback. All we're going
                    // to report is the first, since that's the current write, but we block for all.
                    verify( v.size() >= 1 );

                    if ( res["writebackSince"].numberInt() > 0 ) {
                        // got writeback from older op
                        // ignore the result from it, just needed to wait
                        result.appendElements( res );
                    }
                    else if ( writebacks[0].fromLastOperation ) {
                        result.appendElements( v[0] );
                        result.appendElementsUnique( res );
                        result.append( "writebackGLE" , v[0] );
                        result.append( "initialGLEHost" , theShard );
                        result.append( "initialGLE", res );
                    }
                    else {
                        // There was a writeback, but it's from an old operation, so all that's
                        // important is that we block, not that we return stats.
                        result.appendElements( res );
                    }
                }
            }
            else {
                result.append( "singleShard" , theShard );
                result.appendElements( res );
            }

            return ok;
        }

        BSONArrayBuilder bbb( result.subarrayStart( "shards" ) );
        BSONObjBuilder shardRawGLE;

        long long n = 0;

        int updatedExistingStat = 0; // 0 is none, -1 has but false, 1 has true

        // hit each shard
        vector<string> errors;
        vector<BSONObj> errorObjects;
        for ( set<string>::iterator i = shards->begin(); i != shards->end(); i++ ) {
            string theShard = *i;
            bbb.append( theShard );

            LOG(5) << "gathering a response for gle from: " << theShard << endl;

            boost::scoped_ptr<ShardConnection> conn;
            BSONObj res;
            bool ok = false;
            try {
                conn.reset( new ShardConnection( theShard , "" ) ); // constructor can throw if shard is down
                ok = (*conn)->runCommand( dbName , options , res );
                shardRawGLE.append( theShard , res );
            }
            catch( std::exception &e ){

                // Safe to return here, since we haven't started any extra processing yet, just collecting
                // responses.

                string message =
                        str::stream() << "could not get last error from a shard " << theShard
                                      << causedBy( e );

                warning() << message << endl;
                errmsg = message;

                if (conn)
                    conn->done();

                return false;
            }

            _addWriteBack( writebacks, res, true );

            string temp = DBClientWithCommands::getLastErrorString( res );
            if ( (*conn)->type() != ConnectionString::SYNC && ( ok == false || temp.size() ) ) {
                errors.push_back( temp );
                errorObjects.push_back( res );
            }

            n += res["n"].numberLong();
            if ( res["updatedExisting"].type() ) {
                if ( res["updatedExisting"].trueValue() )
                    updatedExistingStat = 1;
                else if ( updatedExistingStat == 0 )
                    updatedExistingStat = -1;
            }

            conn->done();
        }

        bbb.done();
        result.append( "shardRawGLE" , shardRawGLE.obj() );

        result.appendNumber( "n" , n );
        if ( updatedExistingStat )
            result.appendBool( "updatedExisting" , updatedExistingStat > 0 );

        LOG(4) << "gathering writebacks from " << sinceLastGetError().size() << " hosts for"
               << " gle (" << shards->size() << " shards)" << endl;

        // hit other machines just to block
        for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) {
            string temp = *i;
            if ( shards->count( temp ) )
                continue;

            LOG(5) << "gathering writebacks for multi-shard gle from: " << temp << endl;

            ShardConnection conn( temp , "" );
            try {
                _addWriteBack( writebacks, conn->getLastErrorDetailed(), false );
            }
            catch( std::exception &e ){
                warning() << "could not clear last error from a shard " << temp << causedBy( e ) << endl;
            }
            conn.done();
        }
        clearSinceLastGetError();

        LOG(4) << "checking " << writebacks.size() << " writebacks for"
                << " gle (" << shards->size() << " shards)" << endl;

        if ( errors.size() == 0 ) {
            result.appendNull( "err" );
            _handleWriteBacks( writebacks , fromWriteBackListener );
            return true;
        }

        result.append( "err" , errors[0].c_str() );

        {
            // errs
            BSONArrayBuilder all( result.subarrayStart( "errs" ) );
            for ( unsigned i=0; i<errors.size(); i++ ) {
                all.append( errors[i].c_str() );
            }
            all.done();
        }

        {
            // errObjects
            BSONArrayBuilder all( result.subarrayStart( "errObjects" ) );
            for ( unsigned i=0; i<errorObjects.size(); i++ ) {
                all.append( errorObjects[i] );
            }
            all.done();
        }

        _handleWriteBacks( writebacks , fromWriteBackListener );
        return true;
    }
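For orientation, the multi-shard branch above assembles its reply from the per-shard GLE responses. A hypothetical combined document (field values invented for illustration) has roughly this shape:

    // Illustrative shape of the aggregated getLastError reply built above (values are made up):
    // {
    //     shards:          [ "shard0000", "shard0001" ],
    //     shardRawGLE:     { shard0000: { ok: 1, n: 1, err: null }, shard0001: { ok: 1, n: 0, err: "..." } },
    //     n:               1,
    //     updatedExisting: true,             // only present when some shard reported it
    //     err:             "<first error>",  // null when no shard reported an error
    //     errs:            [ "<first error>" ],
    //     errObjects:      [ { /* raw GLE response of the erroring shard */ } ]
    // }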
Example #22
0
void Socket::handleSendError(int ret, const char* context) {
#if defined(_WIN32)
    const int mongo_errno = WSAGetLastError();
    if (mongo_errno == WSAETIMEDOUT && _timeout != 0) {
#else
    const int mongo_errno = errno;
    if ((mongo_errno == EAGAIN || mongo_errno == EWOULDBLOCK) && _timeout != 0) {
#endif
        LOG(_logLevel) << "Socket " << context << " send() timed out " << remoteString();
        throw SocketException(SocketException::SEND_TIMEOUT, remoteString());
    } else if (mongo_errno != EINTR) {
        LOG(_logLevel) << "Socket " << context << " send() " << errnoWithDescription(mongo_errno)
                       << ' ' << remoteString();
        throw SocketException(SocketException::SEND_ERROR, remoteString());
    }
}

void Socket::handleRecvError(int ret, int len) {
    if (ret == 0) {
        LOG(3) << "Socket recv() conn closed? " << remoteString();
        throw SocketException(SocketException::CLOSED, remoteString());
    }

// ret < 0
#if defined(_WIN32)
    int e = WSAGetLastError();
#else
    int e = errno;
#if defined(EINTR)
    if (e == EINTR) {
        return;
    }
#endif
#endif

#if defined(_WIN32)
    // Windows
    if ((e == EAGAIN || e == WSAETIMEDOUT) && _timeout > 0) {
#else
    if (e == EAGAIN && _timeout > 0) {
#endif
        // this is a timeout
        LOG(_logLevel) << "Socket recv() timeout  " << remoteString();
        throw SocketException(SocketException::RECV_TIMEOUT, remoteString());
    }

    LOG(_logLevel) << "Socket recv() " << errnoWithDescription(e) << " " << remoteString();
    throw SocketException(SocketException::RECV_ERROR, remoteString());
}

void Socket::setTimeout(double secs) {
    setSockTimeouts(_fd, secs);
}

// TODO: allow modification?
//
// <positive value> : secs to wait between stillConnected checks
// 0 : always check
// -1 : never check
const int Socket::errorPollIntervalSecs(5);

// Patch to allow better tolerance of flaky network connections that get broken
// while we aren't looking.
// TODO: Remove when better async changes come.
//
// isStillConnected() polls the socket at max every Socket::errorPollIntervalSecs to determine
// if any disconnection-type events have happened on the socket.
bool Socket::isStillConnected() {
    if (_fd == INVALID_SOCKET) {
        // According to the man page, poll will respond with POLLNVAL for invalid or
        // unopened descriptors, but it doesn't seem to be properly implemented on
        // some platforms - it can return 0 events and 0 for revents. Hence this workaround.
        return false;
    }

    if (errorPollIntervalSecs < 0)
        return true;
    if (!isPollSupported())
        return true;  // nothing we can do

    time_t now = time(0);
    time_t idleTimeSecs = now - _lastValidityCheckAtSecs;

    // Only check once every 5 secs
    if (idleTimeSecs < errorPollIntervalSecs)
        return true;
    // Reset our timer, we're checking the connection
    _lastValidityCheckAtSecs = now;

    // It's been long enough, poll to see if our socket is still connected

    pollfd pollInfo;
    pollInfo.fd = _fd;
    // We only care about reading the EOF message on clean close (and errors)
    pollInfo.events = POLLIN;

    // Poll( info[], size, timeout ) - timeout == 0 => nonblocking
    int nEvents = socketPoll(&pollInfo, 1, 0);

    LOG(2) << "polling for status of connection to " << remoteString() << ", "
           << (nEvents == 0 ? "no events" : nEvents == -1 ? "error detected" : "event detected");

    if (nEvents == 0) {
        // No events incoming, return still connected AFAWK
        return true;
    } else if (nEvents < 0) {
        // Poll itself failed, this is weird, warn and log errno
        warning() << "Socket poll() failed during connectivity check"
                  << " (idle " << idleTimeSecs << " secs,"
                  << " remote host " << remoteString() << ")" << causedBy(errnoWithDescription());

        // Return true since it's not clear that we're disconnected.
        return true;
    }

    dassert(nEvents == 1);
    dassert(pollInfo.revents > 0);

    // Return false at this point, some event happened on the socket, but log what the
    // actual event was.

    if (pollInfo.revents & POLLIN) {
        // There shouldn't really be any data to recv here, so make sure this
        // is a clean hangup.

        const int testBufLength = 1024;
        char testBuf[testBufLength];

        int recvd = ::recv(_fd, testBuf, testBufLength, portRecvFlags);

        if (recvd < 0) {
            // An error occurred during recv, warn and log errno
            warning() << "Socket recv() failed during connectivity check"
                      << " (idle " << idleTimeSecs << " secs,"
                      << " remote host " << remoteString() << ")"
                      << causedBy(errnoWithDescription());
        } else if (recvd > 0) {
            // We got nonzero data from this socket, very weird?
            // Log and warn at runtime, log and abort at devtime
            // TODO: Dump the data to the log somehow?
            error() << "Socket found pending " << recvd
                    << " bytes of data during connectivity check"
                    << " (idle " << idleTimeSecs << " secs,"
                    << " remote host " << remoteString() << ")";
            DEV {
                std::string hex = hexdump(testBuf, recvd);
                error() << "Hex dump of stale log data: " << hex;
            }
            dassert(false);
        } else {
            // recvd == 0, socket closed remotely, just return false
            LOG(0) << "Socket closed remotely, no longer connected"
                   << " (idle " << idleTimeSecs << " secs,"
                   << " remote host " << remoteString() << ")";
        }
    } else if (pollInfo.revents & POLLHUP) {
Example #23
0
    BSONObj DBClientReplicaSet::findOne(const string &ns, const Query& query, const BSONObj *fieldsToReturn, int queryOptions) {
        if ( queryOptions & QueryOption_SlaveOk ) {
            // we're ok sending to a slave
            // we'll try 2 slaves before just using master
            // checkSlave will try a different slave automatically after a failure
            for ( int i=0; i<3; i++ ) {
                try {
                    return checkSlave()->findOne(ns,query,fieldsToReturn,queryOptions);
                }
                catch ( DBException &e ) {
                	LOG(1) << "can't findone replica set slave " << i << " : " << _slaveHost << causedBy( e ) << endl;
                }
            }
        }

        return checkMaster()->findOne(ns,query,fieldsToReturn,queryOptions);
    }
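The retry-against-secondaries branch above only runs when the caller passes QueryOption_SlaveOk. A hypothetical call that exercises it might look like this; the replica set name, hosts, and namespace are invented for the sketch.

    // Hypothetical read that opts into secondary reads, exercising the slave retry loop above.
    std::vector<HostAndPort> seeds;
    seeds.push_back( HostAndPort( "db1.example.net", 27017 ) );
    seeds.push_back( HostAndPort( "db2.example.net", 27017 ) );

    DBClientReplicaSet conn( "rs0", seeds );
    // ... assume conn.connect() succeeded ...

    BSONObj doc = conn.findOne( "test.users",
                                Query( BSON( "_id" << 42 ) ),
                                NULL,                   // fieldsToReturn
                                QueryOption_SlaveOk );  // allow reads from secondaries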
Example #24
0
        virtual void process( Message& m , AbstractMessagingPort* p , LastError * le) {
            verify( p );
            Request r( m , p );

            verify( le );
            lastError.startRequest( m , le );

            try {
                r.init();
                r.process();
            }
            catch ( AssertionException & e ) {
                log( e.isUserAssertion() ? 1 : 0 ) << "AssertionException while processing op type : " << m.operation() << " to : " << r.getns() << causedBy(e) << endl;

                le->raiseError( e.getCode() , e.what() );

                m.header()->id = r.id();

                if ( r.expectResponse() ) {
                    BSONObj err = BSON( "$err" << e.what() << "code" << e.getCode() );
                    replyToQuery( ResultFlag_ErrSet, p , m , err );
                }
            }
            catch ( DBException& e ) {
                log() << "DBException in process: " << e.what() << endl;

                le->raiseError( e.getCode() , e.what() );

                m.header()->id = r.id();

                if ( r.expectResponse() ) {
                    BSONObj err = BSON( "$err" << e.what() << "code" << e.getCode() );
                    replyToQuery( ResultFlag_ErrSet, p , m , err );
                }
            }
        }
Example #25
0
 bool DBClientReplicaSet::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) {
     if ( toSend.operation() == dbQuery ) {
         // TODO: might be possible to do this faster by changing api
         DbMessage dm( toSend );
         QueryMessage qm( dm );
         if ( qm.queryOptions & QueryOption_SlaveOk ) {
             for ( int i=0; i<3; i++ ) {
                 try {
                     DBClientConnection* s = checkSlave();
                     if ( actualServer )
                         *actualServer = s->getServerAddress();
                     return s->call( toSend , response , assertOk );
                 }
                 catch ( DBException &e ) {
                 	LOG(1) << "can't call replica set slave " << i << " : " << _slaveHost << causedBy( e ) << endl;
                     if ( actualServer )
                         *actualServer = "";
                 }
             }
         }
     }
     
     DBClientConnection* m = checkMaster();
     if ( actualServer )
         *actualServer = m->getServerAddress();
     return m->call( toSend , response , assertOk );
 }
Example #26
0
void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely.
    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();
        if (!_init(txn.get())) {
            log() << "will retry to initialize balancer in one minute";
            sleepsecs(60);
            continue;
        }

        break;
    }

    Seconds balanceRoundInterval(kBalanceRoundDefaultInterval);

    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();

        BalanceRoundDetails roundDetails;

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping(txn.get(), false);

            MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) {
                const BSONObj& data = scopedBalancerRoundInterval.getData();
                balanceRoundInterval = Seconds(data["sleepSecs"].numberInt());
            }

            // Use fresh shard state and balancer settings
            Grid::get(txn.get())->shardRegistry()->reload(txn.get());

            auto balancerConfig = Grid::get(txn.get())->getBalancerConfiguration();
            Status refreshStatus = balancerConfig->refreshAndCheck(txn.get());
            if (!refreshStatus.isOK()) {
                warning() << "Skipping balancing round" << causedBy(refreshStatus);
                sleepFor(balanceRoundInterval);
                continue;
            }

            // now make sure we should even be running
            if (!balancerConfig->isBalancerActive() || MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(txn.get(), true);

                sleepFor(balanceRoundInterval);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get()));

            {
                auto scopedDistLock = grid.catalogManager(txn.get())
                                          ->distLock(txn.get(),
                                                     "balancer",
                                                     "doing balance round",
                                                     DistLockManager::kSingleLockAttemptTimeout);

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(txn.get(), true);

                    sleepFor(balanceRoundInterval);  // no need to wake up soon
                    continue;
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << balancerConfig->waitForDelete()
                       << ", secondaryThrottle: "
                       << balancerConfig->getSecondaryThrottle().toBSON();

                OCCASIONALLY warnOnMultiVersion(
                    uassertStatusOK(_clusterStats->getStats(txn.get())));

                Status status = _enforceTagRanges(txn.get());
                if (!status.isOK()) {
                    warning() << "Failed to enforce tag ranges" << causedBy(status);
                } else {
                    LOG(1) << "Done enforcing tag range boundaries.";
                }

                const auto candidateChunks = uassertStatusOK(
                    _chunkSelectionPolicy->selectChunksToMove(txn.get(), _balancedLastTime));

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime = _moveChunks(txn.get(),
                                                    candidateChunks,
                                                    balancerConfig->getSecondaryThrottle(),
                                                    balancerConfig->waitForDelete());

                    roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()),
                                              _balancedLastTime);

                    grid.catalogManager(txn.get())
                        ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());
                }

                LOG(1) << "*** End of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(txn.get(), true);

            sleepFor(_balancedLastTime ? kShortBalanceRoundInterval : balanceRoundInterval);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            roundDetails.setFailed(e.what());

            grid.catalogManager(txn.get())
                ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());

            // Sleep a fair amount before retrying because of the error
            sleepFor(balanceRoundInterval);
        }
    }
}
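Example #27
0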
// Semantics of this method: if the lock cannot be acquired, it returns false and the call can
// be retried. If the lock should not be tried again (some unexpected error), a LockException
// is thrown.
bool DistributedLock::lock_try(const OID& lockID,
                               const string& why,
                               BSONObj* other,
                               double timeout) {
    // This should always be true; if not, we are using the lock incorrectly.
    verify(_name != "");

    auto lockTimeout = _lockTimeout;
    MONGO_FAIL_POINT_BLOCK(setSCCCDistLockTimeout, customTimeout) {
        const BSONObj& data = customTimeout.getData();
        lockTimeout = data["timeoutMs"].numberInt();
    }
    LOG(logLvl) << "trying to acquire new distributed lock for " << _name << " on " << _conn
                << " ( lock timeout : " << lockTimeout << ", ping interval : " << _lockPing
                << ", process : " << _processId << " )" << endl;

    // write to dummy if 'other' is null
    BSONObj dummyOther;
    if (other == NULL)
        other = &dummyOther;

    ScopedDbConnection conn(_conn.toString(), timeout);

    BSONObjBuilder queryBuilder;
    queryBuilder.append(LocksType::name(), _name);
    queryBuilder.append(LocksType::state(), LocksType::UNLOCKED);

    {
        // make sure it's there so we can use simple update logic below
        BSONObj o = conn->findOne(LocksType::ConfigNS, BSON(LocksType::name(_name))).getOwned();

        // Case 1: No locks
        if (o.isEmpty()) {
            try {
                LOG(logLvl) << "inserting initial doc in " << LocksType::ConfigNS << " for lock "
                            << _name << endl;
                conn->insert(LocksType::ConfigNS,
                             BSON(LocksType::name(_name) << LocksType::state(LocksType::UNLOCKED)
                                                         << LocksType::who("")
                                                         << LocksType::lockID(OID())));
            } catch (UserException& e) {
                warning() << "could not insert initial doc for distributed lock " << _name
                          << causedBy(e) << endl;
            }
        }

        // Case 2: A set lock that we might be able to force
        else if (o[LocksType::state()].numberInt() > LocksType::UNLOCKED) {
            string lockName =
                o[LocksType::name()].String() + string("/") + o[LocksType::process()].String();

            BSONObj lastPing = conn->findOne(
                LockpingsType::ConfigNS, o[LocksType::process()].wrap(LockpingsType::process()));
            if (lastPing.isEmpty()) {
                LOG(logLvl) << "empty ping found for process in lock '" << lockName << "'" << endl;
                // TODO: Using 0 as a "no time found" value will fail if dates roll over, but
                // then, so will a lot of other things.
                lastPing = BSON(LockpingsType::process(o[LocksType::process()].String())
                                << LockpingsType::ping(Date_t()));
            }

            unsigned long long elapsed = 0;
            unsigned long long takeover = lockTimeout;

            DistLockPingInfo lastPingEntry = getLastPing();

            LOG(logLvl) << "checking last ping for lock '" << lockName << "' against process "
                        << lastPingEntry.processId << " and ping " << lastPingEntry.lastPing;

            try {
                Date_t remote = remoteTime(_conn);

                auto pingDocProcessId = lastPing[LockpingsType::process()].String();
                auto pingDocPingValue = lastPing[LockpingsType::ping()].Date();

                // Timeout the elapsed time using comparisons of remote clock
                // For non-finalized locks, timeout 15 minutes since last seen (ts)
                // For finalized locks, timeout 15 minutes since last ping
                bool recPingChange = o[LocksType::state()].numberInt() == LocksType::LOCKED &&
                    (lastPingEntry.processId != pingDocProcessId ||
                     lastPingEntry.lastPing != pingDocPingValue);
                bool recTSChange = lastPingEntry.lockSessionId != o[LocksType::lockID()].OID();

                if (recPingChange || recTSChange) {
                    // If the ping has changed since we last checked, mark the current date and time
                    setLastPing(DistLockPingInfo(pingDocProcessId,
                                                 pingDocPingValue,
                                                 remote,
                                                 o[LocksType::lockID()].OID(),
                                                 OID()));
                } else {
                    // GOTCHA!  Due to network issues, it is possible that the current time
                    // is less than the remote time.  We *have* to check this here, otherwise
                    // we overflow and our lock breaks.
                    if (lastPingEntry.configLocalTime >= remote)
                        elapsed = 0;
                    else
                        elapsed =
                            durationCount<Milliseconds>(remote - lastPingEntry.configLocalTime);
                }
            } catch (LockException& e) {
                // Remote server cannot be found / is not responsive
                warning() << "Could not get remote time from " << _conn << causedBy(e);
                // If our config server is having issues, forget all the pings until we can see it
                // again
                resetLastPing();
            }

            if (elapsed <= takeover) {
                LOG(1) << "could not force lock '" << lockName << "' because elapsed time "
                       << elapsed << " <= takeover time " << takeover;
                *other = o;
                other->getOwned();
                conn.done();
                return false;
            }

            LOG(0) << "forcing lock '" << lockName << "' because elapsed time " << elapsed
                   << " > takeover time " << takeover;

            if (elapsed > takeover) {
                // The lock may be forced here; reset our timer whether that succeeds or fails.
                // This ensures that another timeout must happen if something borks up here, and
                // resets our pristine ping state if acquired.
                resetLastPing();

                try {
                    // Check the clock skew again.  If we check this before we get a lock
                    // and after the lock times out, we can be pretty sure the time is
                    // increasing at the same rate on all servers and therefore our
                    // timeout is accurate
                    if (isRemoteTimeSkewed()) {
                        string msg(str::stream() << "remote time in cluster " << _conn.toString()
                                                 << " is now skewed, cannot force lock.");
                        throw LockException(msg, ErrorCodes::DistributedClockSkewed);
                    }

                    // Make sure we break the lock with the correct "ts" (OID) value, otherwise
                    // we can overwrite a new lock inserted in the meantime.
                    conn->update(LocksType::ConfigNS,
                                 BSON(LocksType::name(_name)
                                      << LocksType::state() << o[LocksType::state()].numberInt()
                                      << LocksType::lockID(o[LocksType::lockID()].OID())),
                                 BSON("$set" << BSON(LocksType::state(LocksType::UNLOCKED))));

                    BSONObj err = conn->getLastErrorDetailed();
                    string errMsg = DBClientWithCommands::getLastErrorString(err);

                    // TODO: Clean up all the extra code to exit this method, probably with a
                    // refactor
                    if (!errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1) {
                        logErrMsgOrWarn(
                            "Could not force lock", lockName, errMsg, "(another force won");
                        *other = o;
                        other->getOwned();
                        conn.done();
                        return false;
                    }

                } catch (UpdateNotTheSame&) {
                    // Ok to continue since we know we forced at least one lock document, and all
                    // lock docs are required for a lock to be held.
                    warning() << "lock forcing " << lockName << " inconsistent" << endl;
                } catch (const LockException&) {
                    // Let the exception go up and don't repackage the exception.
                    throw;
                } catch (std::exception& e) {
                    conn.done();
                    string msg(str::stream() << "exception forcing distributed lock " << lockName
                                             << causedBy(e));
                    throw LockException(msg, 13660);
                }

            } else {
                // Not strictly necessary, but helpful for small timeouts where thread
                // scheduling is significant. This ensures that two attempts are still
                // required for a force if not acquired, and resets our state if we
                // are acquired.
                resetLastPing();

                // Test that the lock is held by trying to update the finalized state of the lock
                // to the same state; if it does not update, or does not update on all servers,
                // we can't re-enter.
                try {
                    // Test the lock with the correct "ts" (OID) value
                    conn->update(LocksType::ConfigNS,
                                 BSON(LocksType::name(_name)
                                      << LocksType::state(LocksType::LOCKED)
                                      << LocksType::lockID(o[LocksType::lockID()].OID())),
                                 BSON("$set" << BSON(LocksType::state(LocksType::LOCKED))));

                    BSONObj err = conn->getLastErrorDetailed();
                    string errMsg = DBClientWithCommands::getLastErrorString(err);

                    // TODO: Clean up all the extra code to exit this method, probably with a
                    // refactor
                    if (!errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1) {
                        logErrMsgOrWarn(
                            "Could not re-enter lock", lockName, errMsg, "(not sure lock is held");
                        *other = o;
                        other->getOwned();
                        conn.done();
                        return false;
                    }

                } catch (UpdateNotTheSame&) {
                    // NOT ok to continue since our lock isn't held by all servers, so isn't valid.
                    warning() << "inconsistent state re-entering lock, lock " << lockName
                              << " not held" << endl;
                    *other = o;
                    other->getOwned();
                    conn.done();
                    return false;
                } catch (std::exception& e) {
                    conn.done();
                    string msg(str::stream() << "exception re-entering distributed lock "
                                             << lockName << causedBy(e));
                    throw LockException(msg, 13660);
                }

                LOG(logLvl - 1) << "re-entered distributed lock '" << lockName << "'" << endl;
                *other = o.getOwned();
                conn.done();
                return true;
            }

            LOG(logLvl - 1) << "lock '" << lockName << "' successfully forced" << endl;

            // We don't need the ts value in the query, since we will only ever replace locks with
            // state=0.
        }
        // Case 3: We have an expired lock
        else if (o[LocksType::lockID()].type()) {
            queryBuilder.append(o[LocksType::lockID()]);
        }
    }

    // Always reset our ping if we're trying to get a lock, since getting a lock implies the lock
    // state is open and no locks need to be forced.  If anything goes wrong, we don't want to
    // remember an old lock.
    resetLastPing();

    bool gotLock = false;
    BSONObj currLock;

    BSONObj lockDetails =
        BSON(LocksType::state(LocksType::LOCK_PREP)
             << LocksType::who(getDistLockId()) << LocksType::process(_processId)
             << LocksType::when(jsTime()) << LocksType::why(why) << LocksType::lockID(lockID));
    BSONObj whatIWant = BSON("$set" << lockDetails);

    BSONObj query = queryBuilder.obj();

    string lockName = _name + string("/") + _processId;

    try {
        // Main codepath to acquire lock

        LOG(logLvl) << "about to acquire distributed lock '" << lockName << "'";

        LOG(logLvl + 1) << "trying to acquire lock " << query.toString(false, true)
                        << " with details " << lockDetails.toString(false, true) << endl;

        conn->update(LocksType::ConfigNS, query, whatIWant);

        BSONObj err = conn->getLastErrorDetailed();
        string errMsg = DBClientWithCommands::getLastErrorString(err);

        currLock = conn->findOne(LocksType::ConfigNS, BSON(LocksType::name(_name)));

        if (!errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1) {
            logErrMsgOrWarn("could not acquire lock", lockName, errMsg, "(another update won)");
            *other = currLock;
            other->getOwned();
            gotLock = false;
        } else {
            gotLock = true;
        }

    } catch (UpdateNotTheSame& up) {
        // this means our update got through on some, but not others
        warning() << "distributed lock '" << lockName << " did not propagate properly."
                  << causedBy(up) << endl;

        // Overall protection derives from:
        // All unlocking updates use the ts value when setting state to 0
        //   This ensures that during locking, we can override all smaller ts locks with
        //   our own safe ts value and not be unlocked afterward.
        for (unsigned i = 0; i < up.size(); i++) {
            ScopedDbConnection indDB(up[i].first);
            BSONObj indUpdate;

            try {
                indUpdate = indDB->findOne(LocksType::ConfigNS, BSON(LocksType::name(_name)));
                const auto currentLockID = indUpdate[LocksType::lockID()].OID();
                // If we override this lock in any way, grab and protect it.
                // We assume/ensure that if a process does not have all lock documents, it is no
                // longer holding the lock.
                // Note - finalized locks may compete too, but we know they've won already if
                // competing in this round.  Cleanup of crashes during finalizing may take a few
                // tries.
                if (currentLockID < lockID ||
                    indUpdate[LocksType::state()].numberInt() == LocksType::UNLOCKED) {
                    BSONObj grabQuery =
                        BSON(LocksType::name(_name) << LocksType::lockID(currentLockID));

                    // Change ts so we won't be forced, state so we won't be relocked
                    BSONObj grabChanges =
                        BSON(LocksType::lockID(lockID) << LocksType::state(LocksType::LOCK_PREP));

                    // Either our update will succeed, and we'll grab the lock, or it will fail b/c
                    // some other process grabbed the lock (which will change the ts), but the lock
                    // will be set until forcing
                    indDB->update(LocksType::ConfigNS, grabQuery, BSON("$set" << grabChanges));

                    indUpdate = indDB->findOne(LocksType::ConfigNS, BSON(LocksType::name(_name)));

                    // The tournament was interfered with and it is not safe to proceed further.
                    // One case where this could happen is when the LockPinger processes old
                    // entries from addUnlockOID. See SERVER-10688 for a more detailed
                    // description of the race.
                    if (indUpdate[LocksType::state()].numberInt() <= LocksType::UNLOCKED) {
                        LOG(logLvl - 1) << "lock tournament interrupted, "
                                        << "so no lock was taken; "
                                        << "new state of lock: " << indUpdate << endl;

                        // We now break and set our currLock lockID value to zero, so that
                        // we know that we did not acquire the lock below. Later code will
                    // clean up failed entries.
                        currLock = BSON(LocksType::lockID(OID()));
                        indDB.done();
                        break;
                    }
                }
                // else our lock is the same, in which case we're safe, or it's a bigger lock,
                // in which case we won't need to protect anything since we won't have the lock.

            } catch (std::exception& e) {
                conn.done();
                string msg(str::stream() << "distributed lock " << lockName
                                         << " had errors communicating with individual server "
                                         << up[i].first << causedBy(e));
                throw LockException(msg, 13661, lockID);
            }

            verify(!indUpdate.isEmpty());

            // Find max TS value
            if (currLock.isEmpty() ||
                currLock[LocksType::lockID()] < indUpdate[LocksType::lockID()]) {
                currLock = indUpdate.getOwned();
            }

            indDB.done();
        }

        // Locks on all servers are now set and safe until forcing

        if (currLock[LocksType::lockID()].OID() == lockID) {
            LOG(logLvl - 1) << "lock update won, completing lock propagation for '" << lockName
                            << "'" << endl;
            gotLock = true;
        } else {
            LOG(logLvl - 1) << "lock update lost, lock '" << lockName << "' not propagated."
                            << endl;
            gotLock = false;
        }
    } catch (std::exception& e) {
        conn.done();
        string msg(str::stream() << "exception creating distributed lock " << lockName
                                 << causedBy(e));
        throw LockException(msg, 13663, lockID);
    }

    // Complete lock propagation
    if (gotLock) {
        // This is now safe, since we know that no new locks will be placed on top of the ones we've
        // checked for at least 15 minutes.  Sets the state = 2, so that future clients can
        // determine that the lock is truly set. The invariant for rollbacks is that we will never
        // force locks with state = 2 and active pings, since that indicates the lock is active, but
        // this means the process creating/destroying them must explicitly poll when something goes
        // wrong.
        try {
            BSONObjBuilder finalLockDetails;
            BSONObjIterator bi(lockDetails);
            while (bi.more()) {
                BSONElement el = bi.next();
                if ((string)(el.fieldName()) == LocksType::state())
                    finalLockDetails.append(LocksType::state(), LocksType::LOCKED);
                else
                    finalLockDetails.append(el);
            }

            conn->update(LocksType::ConfigNS,
                         BSON(LocksType::name(_name)),
                         BSON("$set" << finalLockDetails.obj()));

            BSONObj err = conn->getLastErrorDetailed();
            string errMsg = DBClientWithCommands::getLastErrorString(err);

            currLock = conn->findOne(LocksType::ConfigNS, BSON(LocksType::name(_name)));

            if (!errMsg.empty() || !err["n"].type() || err["n"].numberInt() < 1) {
                warning() << "could not finalize winning lock " << lockName
                          << (!errMsg.empty() ? causedBy(errMsg) : " (did not update lock) ")
                          << endl;
                gotLock = false;
            } else {
                // SUCCESS!
                gotLock = true;
            }

        } catch (std::exception& e) {
            conn.done();
            string msg(str::stream() << "exception finalizing winning lock" << causedBy(e));
            // Inform caller about the potential orphan lock.
            throw LockException(msg, 13662, lockID);
        }
    }

    *other = currLock;
    other->getOwned();

    // Log our lock results
    if (gotLock)
        LOG(logLvl - 1) << "distributed lock '" << lockName << "' acquired for '" << why
                        << "', ts : " << currLock[LocksType::lockID()].OID();
    else
        LOG(logLvl - 1) << "distributed lock '" << lockName << "' was not acquired.";

    conn.done();

    return gotLock;
}
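The comment at the top of lock_try spells out its retry contract: a false return means the lock is held elsewhere and the call may simply be retried later, while a LockException means the attempt must be abandoned. A hypothetical caller built on that contract is sketched below; the wrapper name, attempt count, and back-off are invented.

// Hypothetical acquire-with-retry helper around DistributedLock::lock_try().
bool tryAcquireWithRetry(DistributedLock& lock, const OID& lockSessionID, BSONObj* otherOwner) {
    for (int attempt = 0; attempt < 3; ++attempt) {
        try {
            if (lock.lock_try(lockSessionID, "doing balance round", otherOwner, 0.0)) {
                return true;  // lock document finalized on the config servers
            }
        } catch (const LockException& e) {
            // Unexpected error (clock skew, communication failure, ...): do not retry.
            warning() << "giving up on distributed lock" << causedBy(e);
            return false;
        }

        // Someone else holds the lock; back off briefly before trying again.
        sleepsecs(1);
    }

    return false;
}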
Example #28
0
int Balancer::_moveChunks(OperationContext* txn,
                          const BalancerChunkSelectionPolicy::MigrateInfoVector& candidateChunks,
                          const MigrationSecondaryThrottleOptions& secondaryThrottle,
                          bool waitForDelete) {
    int movedCount = 0;

    for (const auto& migrateInfo : candidateChunks) {
        // If the balancer was disabled since we started this round, don't start new chunk
        // moves.
        if (!Grid::get(txn)->getBalancerConfiguration()->isBalancerActive() ||
            MONGO_FAIL_POINT(skipBalanceRound)) {
            LOG(1) << "Stopping balancing round early as balancing was disabled";
            return movedCount;
        }

        // Changes to metadata, borked metadata, and connectivity problems between shards
        // should cause us to abort this chunk move, but shouldn't cause us to abort the entire
        // round of chunks.
        //
        // TODO(spencer): We probably *should* abort the whole round on issues communicating
        // with the config servers, but it's impossible to distinguish those types of failures
        // at the moment.
        //
        // TODO: Handle all these things more cleanly, since they're expected problems

        const NamespaceString nss(migrateInfo.ns);

        try {
            auto scopedCM = uassertStatusOK(ScopedChunkManager::getExisting(txn, nss));
            ChunkManager* const cm = scopedCM.cm();

            shared_ptr<Chunk> c = cm->findIntersectingChunk(txn, migrateInfo.minKey);

            if (c->getMin().woCompare(migrateInfo.minKey) ||
                c->getMax().woCompare(migrateInfo.maxKey)) {
                log() << "Migration " << migrateInfo
                      << " will be skipped this round due to chunk metadata mismatch.";
                scopedCM.db()->getChunkManager(txn, nss.ns(), true);
                continue;
            }

            BSONObj res;
            if (c->moveAndCommit(txn,
                                 migrateInfo.to,
                                 Grid::get(txn)->getBalancerConfiguration()->getMaxChunkSizeBytes(),
                                 secondaryThrottle,
                                 waitForDelete,
                                 0, /* maxTimeMS */
                                 res)) {
                movedCount++;
                continue;
            }

            log() << "balancer move failed: " << res << ", migrate: " << migrateInfo;

            Status moveStatus = getStatusFromCommandResult(res);

            if (moveStatus == ErrorCodes::ChunkTooBig || res["chunkTooBig"].trueValue()) {
                log() << "Performing a split because migrate failed for size reasons";

                auto splitStatus = c->split(txn, Chunk::normal, NULL);
                if (!splitStatus.isOK()) {
                    log() << "marking chunk as jumbo: " << c->toString();

                    c->markAsJumbo(txn);

                    // We increment movedCount so we do another round right away
                    movedCount++;
                }
            }
        } catch (const DBException& ex) {
            log() << "balancer move " << migrateInfo << " failed" << causedBy(ex);
        }
    }

    return movedCount;
}
Example #29
0
    bool ClientInfo::getLastError( const BSONObj& options , BSONObjBuilder& result , bool fromWriteBackListener ) {
        set<string> * shards = getPrev();

        if ( shards->size() == 0 ) {
            result.appendNull( "err" );
            return true;
        }

        vector<WBInfo> writebacks;

        // handle single server
        if ( shards->size() == 1 ) {
            string theShard = *(shards->begin() );

            ShardConnection conn( theShard , "", true );

            BSONObj res;
            bool ok = false;
            try {
                ok = conn->runCommand( "admin" , options , res );
            }
            catch( std::exception &e ) {
                warning() << "could not get last error." << causedBy( e ) << endl;

                // Catch everything that happens here, since we need to ensure we return our
                // connection when we're finished.
                conn.done();

                return false;
            }

            res = res.getOwned();
            conn.done();

            _addWriteBack( writebacks , res );

            // hit other machines just to block
            for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) {
                string temp = *i;
                if ( temp == theShard )
                    continue;

                ShardConnection conn( temp , "" );
                _addWriteBack( writebacks , conn->getLastErrorDetailed() );
                conn.done();
            }
            clearSinceLastGetError();
            
            if ( writebacks.size() ){
                vector<BSONObj> v = _handleWriteBacks( writebacks , fromWriteBackListener );
                if ( v.size() == 0 && fromWriteBackListener ) {
                    // ok
                }
                else {
                    assert( v.size() == 1 );
                    result.appendElements( v[0] );
                    result.appendElementsUnique( res );
                    result.append( "writebackGLE" , v[0] );
                    result.append( "initialGLEHost" , theShard );
                }
            }
            else {
                result.append( "singleShard" , theShard );
                result.appendElements( res );
            }
            
            return ok;
        }

        BSONArrayBuilder bbb( result.subarrayStart( "shards" ) );
        BSONObjBuilder shardRawGLE;

        long long n = 0;
        
        int updatedExistingStat = 0; // 0 is none, -1 has but false, 1 has true

        // hit each shard
        vector<string> errors;
        vector<BSONObj> errorObjects;
        for ( set<string>::iterator i = shards->begin(); i != shards->end(); i++ ) {
            string theShard = *i;
            bbb.append( theShard );
            ShardConnection conn( theShard , "", true );
            BSONObj res;
            bool ok = false;
            try {
                ok = conn->runCommand( "admin" , options , res );
                shardRawGLE.append( theShard , res );
            }
            catch( std::exception &e ) {
                // Safe to return here, since we haven't started any extra processing yet, just
                // collecting responses.
                warning() << "could not get last error." << causedBy( e ) << endl;
                conn.done();

                return false;
            }
            
            _addWriteBack( writebacks, res );
            
            string temp = DBClientWithCommands::getLastErrorString( res );
            if ( conn->type() != ConnectionString::SYNC && ( ok == false || temp.size() ) ) {
                errors.push_back( temp );
                errorObjects.push_back( res );
            }

            n += res["n"].numberLong();
            if ( res["updatedExisting"].type() ) {
                if ( res["updatedExisting"].trueValue() )
                    updatedExistingStat = 1;
                else if ( updatedExistingStat == 0 )
                    updatedExistingStat = -1;
            }

            conn.done();
        }

        bbb.done();
        result.append( "shardRawGLE" , shardRawGLE.obj() );

        result.appendNumber( "n" , n );
        if ( updatedExistingStat )
            result.appendBool( "updatedExisting" , updatedExistingStat > 0 );

        // Run getLastError on any other shard hosts contacted since the last getLastError,
        // just to block until their writes complete; queue any writebacks they request.
        for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) {
            string temp = *i;
            if ( shards->count( temp ) )
                continue;

            ShardConnection conn( temp , "" );
            _addWriteBack( writebacks, conn->getLastErrorDetailed() );
            conn.done();
        }
        clearSinceLastGetError();

        if ( errors.size() == 0 ) {
            result.appendNull( "err" );
            _handleWriteBacks( writebacks , fromWriteBackListener );
            return true;
        }

        result.append( "err" , errors[0].c_str() );

        {
            // errs
            BSONArrayBuilder all( result.subarrayStart( "errs" ) );
            for ( unsigned i=0; i<errors.size(); i++ ) {
                all.append( errors[i].c_str() );
            }
            all.done();
        }

        {
            // errObjects
            BSONArrayBuilder all( result.subarrayStart( "errObjects" ) );
            for ( unsigned i=0; i<errorObjects.size(); i++ ) {
                all.append( errorObjects[i] );
            }
            all.done();
        }
        _handleWriteBacks( writebacks , fromWriteBackListener );
        return true;
    }
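
For reference, a minimal sketch of how a caller might read the aggregated getLastError document built above. This helper is not part of the MongoDB source: its name is invented, and it assumes only the field names appended in the function ("err", "errs") plus the usual BSON and logging headers.

    // Hypothetical helper, shown for illustration only.
    static void logAggregatedGLE( const BSONObj& gle ) {
        BSONElement err = gle["err"];
        if ( err.eoo() || err.isNull() ) {
            // a null (or missing) "err" means no shard reported a write error
            return;
        }
        log() << "getLastError reported: " << err.String() << endl;
        if ( gle.hasField( "errs" ) ) {
            // one entry per failing shard, parallel to the "errObjects" array
            BSONForEach( e, gle["errs"].Obj() ) {
                log() << "  shard error: " << e.String() << endl;
            }
        }
    }
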
    int Balancer::_moveChunks(const vector<CandidateChunkPtr>* candidateChunks,
                              bool secondaryThrottle,
                              bool waitForDelete)
    {
        int movedCount = 0;

        for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) {
            const CandidateChunk& chunkInfo = *it->get();

            // Changes to metadata, borked metadata, and connectivity problems should cause us to
            // abort this chunk move, but shouldn't cause us to abort the entire round of chunks.
            // TODO: Handle all these things more cleanly, since they're expected problems
            try {

                DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
                verify( cfg );

                // NOTE: We purposely do not reload metadata here, since _doBalanceRound already
                // tried to do so once.
                ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
                verify( cm );

                ChunkPtr c = cm->findIntersectingChunk( chunkInfo.chunk.min );
                if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                    // likely a split happened somewhere
                    cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
                    verify( cm );

                    c = cm->findIntersectingChunk( chunkInfo.chunk.min );
                    if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                        log() << "chunk mismatch after reload, ignoring will retry issue " << chunkInfo.chunk.toString() << endl;
                        continue;
                    }
                }

                BSONObj res;
                if (c->moveAndCommit(Shard::make(chunkInfo.to),
                                     Chunk::MaxChunkSize,
                                     secondaryThrottle,
                                     waitForDelete,
                                     res)) {
                    movedCount++;
                    continue;
                }

                // the move requires acquiring the collection metadata's lock, which can fail
                log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to
                      << " chunk: " << chunkInfo.chunk << endl;

                if ( res["chunkTooBig"].trueValue() ) {
                    // The migration failed because the chunk is over the maximum size: force a
                    // split, and if that also fails mark the chunk as jumbo so the balancer
                    // stops retrying it.

                    // reload just to be safe
                    cm = cfg->getChunkManager( chunkInfo.ns );
                    verify( cm );
                    c = cm->findIntersectingChunk( chunkInfo.chunk.min );

                    log() << "forcing a split because migrate failed for size reasons" << endl;

                    res = BSONObj();
                    c->singleSplit( true , res );
                    log() << "forced split results: " << res << endl;

                    if ( ! res["ok"].trueValue() ) {
                        log() << "marking chunk as jumbo: " << c->toString() << endl;
                        c->markAsJumbo();
                        // we increment movedCount so we do another round right away
                        movedCount++;
                    }

                }
            }
            catch( const DBException& ex ) {
                warning() << "could not move chunk " << chunkInfo.chunk.toString()
                          << ", continuing balancing round" << causedBy( ex ) << endl;
            }
        }

        return movedCount;
    }
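
The comment at the top of the loop above captures the key design choice: a failure moving one chunk aborts only that chunk, never the whole balancing round. Below is a self-contained sketch of that same pattern, with all names invented for illustration (nothing in it is taken from the MongoDB source):

    #include <iostream>
    #include <stdexcept>
    #include <vector>

    // Apply every task, isolating failures per item and counting successes,
    // the way _moveChunks treats each candidate chunk independently.
    static int applyAll( const std::vector<int>& tasks ) {
        int applied = 0;
        for ( std::vector<int>::const_iterator it = tasks.begin(); it != tasks.end(); ++it ) {
            try {
                if ( *it < 0 )
                    throw std::runtime_error( "simulated move failure" );
                ++applied;   // success: count it and keep going
            }
            catch ( const std::exception& e ) {
                // abort this item only, not the whole round
                std::cerr << "skipping task: " << e.what() << std::endl;
            }
        }
        return applied;
    }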