Example #1
0
 ~ScopeCache() {
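     // If the server is shutting down, skip cleanup; otherwise clear all cached entries.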
     if (inShutdown())
         return;
     clear();
 }
Example #2
0
        void run() {
            if ( _token.size() == 0  && _name.size() == 0 ) {
                LOG(1) << "mms not configured" << endl;
                return;
            }

            if ( _token.size() == 0 ) {
                log() << "no token for mms - not running" << endl;
                return;
            }

            if ( _name.size() == 0 ) {
                log() << "no name for mms - not running" << endl;
                return;
            }

            log() << "mms monitor staring...  token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl;
            Client::initThread( "mms" );
            Client& c = cc();


            // TODO: using direct client is bad, but easy for now

            while ( ! inShutdown() ) {
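                // One pass per interval: sleep, then build a status document and POST it to MMS.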
                sleepsecs( _secsToSleep );

                try {
                    stringstream url;
                    url << _baseurl << "?"
                        << "token=" << _token << "&"
                        << "name=" << _name << "&"
                        << "ts=" << time(0)
                        ;

                    BSONObjBuilder bb;
                    // duplicated so the post has everything
                    bb.append( "token" , _token );
                    bb.append( "name" , _name );
                    bb.appendDate( "ts" , jsTime()  );

                    // any commands
                    _add( bb , "buildinfo" );
                    _add( bb , "serverStatus" );

                    BSONObj postData = bb.obj();

                    LOG(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;;

                    HttpClient http;
                    HttpClient::Result r;
                    int rc = http.post( url.str() , postData.jsonString() , &r );
                    LOG(1) << "\t response code: " << rc << endl;
                    if ( rc != 200 ) {
                        log() << "mms error response code:" << rc << endl;
                        LOG(1) << "mms error body:" << r.getEntireResponse() << endl;
                    }
                }
                catch ( std::exception& e ) {
                    log() << "mms exception: " << e.what() << endl;
                }
            }

            c.shutdown();
        }
Example #3
0
    void runSyncThread() {
        Client::initThread("rsSync");
        cc().getAuthorizationSession()->grantInternalAuthorization();
        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

        // Set initial indexPrefetch setting
        const std::string& prefetch = replCoord->getSettings().rsIndexPrefetch;
        if (!prefetch.empty()) {
            BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL;
            if (prefetch == "none")
                prefetchConfig = BackgroundSync::PREFETCH_NONE;
            else if (prefetch == "_id_only")
                prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY;
            else if (prefetch == "all")
                prefetchConfig = BackgroundSync::PREFETCH_ALL;
            else {
                warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting "
                          << "to \"all\"";
            }
            BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig);
        }

        while (!inShutdown()) {
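            // Main sync loop: wait for a valid config, then either run an initial sync
            // or tail the sync source's oplog, until shutdown.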
            // After a reconfig, we may not be in the replica set anymore, so
            // check that we are in the set (and not an arbiter) before
            // trying to sync with other replicas.
            // TODO(spencer): Use a condition variable to await loading a config
            if (replCoord->getMemberState().startup()) {
                warning() << "did not receive a valid config yet, sleeping 5 seconds ";
                sleepsecs(5);
                continue;
            }

            const MemberState memberState = replCoord->getMemberState();

            // An arbiter can never transition to any other state, and doesn't replicate, ever
            if (memberState.arbiter()) {
                break;
            }

            // If we are removed then we don't belong to the set anymore
            if (memberState.removed()) {
                sleepsecs(5);
                continue;
            }

            try {

                if (memberState.primary() && !replCoord->isWaitingForApplierToDrain()) {
                    sleepsecs(1);
                    continue;
                }

                bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag();
                // Check criteria for doing an initial sync:
                // 1. If the oplog is empty, do an initial sync
                // 2. If minValid has _initialSyncFlag set, do an initial sync
                // 3. If initialSyncRequested is true
                if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() ||
                        getInitialSyncFlag() ||
                        initialSyncRequested) {
                    syncDoInitialSync();
                    continue; // start from top again in case sync failed.
                }
                if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
                    continue;
                }

                /* we have some data.  continue tailing. */
                SyncTail tail(BackgroundSync::get(), multiSyncApply);
                tail.oplogApplication();
            }
            catch(const DBException& e) {
                log() << "Received exception while syncing: " << e.toString();
                sleepsecs(10);
            }
            catch(const std::exception& e) {
                log() << "Received exception while syncing: " << e.what();
                sleepsecs(10);
            }
        }
        cc().shutdown();
    }
Example #4
0
NetworkInterfaceMock::~NetworkInterfaceMock() {
    stdx::unique_lock<stdx::mutex> lk(_mutex);
    invariant(!_hasStarted || inShutdown());
    invariant(_scheduled.empty());
    invariant(_blackHoled.empty());
}
Example #5
0
    void BackgroundSync::produce() {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        OplogReader r(false /* doHandshake */);

        // find a target to sync from the last op time written
        getOplogReader(r);

        // no server found
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (_currentSyncTarget == NULL) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }

            r.tailingQueryGTE(rsoplog, _lastOpTimeFetched);
        }

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!r.haveCursor()) {
            return;
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }

        uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

        if (isRollbackRequired(r)) {
            stop();
            return;
        }

        while (!inShutdown()) {
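            // Outer loop re-establishes each tailable-cursor pass; the inner loop
            // drains fetched batches from the sync source into the buffer.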
            while (!inShutdown()) {

                if (!r.moreInCurrentBatch()) {
                    if (theReplSet->gotForceSync()) {
                        return;
                    }

                    if (isAssumingPrimary() || theReplSet->isPrimary()) {
                        return;
                    }

                    // re-evaluate quality of sync target
                    if (shouldChangeSyncTarget()) {
                        return;
                    }
                    // record the time taken by each getmore
                    {
                        TimerHolder batchTimer(&getmoreReplStats);
                        r.more();
                    }
                    // account for the network bytes read in this batch
                    networkByteStats.increment(r.currentBatchMessageSize());

                }

                if (!r.more())
                    break;

                BSONObj o = r.nextSafe().getOwned();
                opsReadStats.increment();

                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    _appliedBuffer = false;
                }

                OCCASIONALLY {
                    LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
                }
                // the blocking queue will wait (forever) until there's room for us to push
                _buffer.push(o);
                bufferCountGauge.increment();
                bufferSizeGauge.increment(getSize(o));

                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    _lastH = o["h"].numberLong();
                    _lastOpTimeFetched = o["ts"]._opTime();
                }
            } // end while

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                    return;
                }
            }


            r.tailCheck();
            if( !r.haveCursor() ) {
                LOG(1) << "replSet end syncTail pass" << rsLog;
                return;
            }

            // looping back is ok because this is a tailable cursor
        }
    }
Example #6
0
    void Listener::initAndListen() {
        checkTicketNumbers();
        vector<SockAddr> mine = ipToAddrs(_ip.c_str(), _port);
        vector<int> socks;
        SOCKET maxfd = 0; // needed for select()

        for (vector<SockAddr>::iterator it=mine.begin(), end=mine.end(); it != end; ++it) {
            SockAddr& me = *it;

            SOCKET sock = ::socket(me.getType(), SOCK_STREAM, 0);
            if ( sock == INVALID_SOCKET ) {
                log() << "ERROR: listen(): invalid socket? " << errnoWithDescription() << endl;
            }

            if (me.getType() == AF_UNIX) {
#if !defined(_WIN32)
                if (unlink(me.getAddr().c_str()) == -1) {
                    int x = errno;
                    if (x != ENOENT) {
                        log() << "couldn't unlink socket file " << me << errnoWithDescription(x) << " skipping" << endl;
                        continue;
                    }
                }
#endif
            }
            else if (me.getType() == AF_INET6) {
                // IPv6 can also accept IPv4 connections as mapped addresses (::ffff:127.0.0.1).
                // That causes a conflict if we don't set IPV6_V6ONLY.
                const int one = 1;
                setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*) &one, sizeof(one));
            }

            prebindOptions( sock );

            if ( ::bind(sock, me.raw(), me.addressSize) != 0 ) {
                int x = errno;
                log() << "listen(): bind() failed " << errnoWithDescription(x) << " for socket: " << me.toString() << endl;
                if ( x == EADDRINUSE )
                    log() << "  addr already in use" << endl;
                closesocket(sock);
                return;
            }

#if !defined(_WIN32)
            if (me.getType() == AF_UNIX) {
                if (chmod(me.getAddr().c_str(), 0777) == -1) {
                    log() << "couldn't chmod socket file " << me << errnoWithDescription() << endl;
                }

                ListeningSockets::get()->addPath( me.getAddr() );
            }
#endif

            if ( ::listen(sock, 128) != 0 ) {
                log() << "listen(): listen() failed " << errnoWithDescription() << endl;
                closesocket(sock);
                return;
            }

            ListeningSockets::get()->add( sock );

            socks.push_back(sock);
            if (sock > maxfd)
                maxfd = sock;
        }

        static long connNumber = 0;
        struct timeval maxSelectTime;
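        // Accept loop: select() across all listening sockets with a 10ms timeout,
        // then accept and hand off any ready connections.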
        while ( ! inShutdown() ) {
            fd_set fds[1];
            FD_ZERO(fds);

            for (vector<int>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) {
                FD_SET(*it, fds);
            }

            maxSelectTime.tv_sec = 0;
            maxSelectTime.tv_usec = 10000;
            const int ret = select(maxfd+1, fds, NULL, NULL, &maxSelectTime);

            if (ret == 0) {
#if defined(__linux__)
                _elapsedTime += ( 10000 - maxSelectTime.tv_usec ) / 1000;
#else
                _elapsedTime += 10;
#endif
                continue;
            }
            _elapsedTime += ret; // assume 1ms to grab connection. very rough

            if (ret < 0) {
                int x = errno;
#ifdef EINTR
                if ( x == EINTR ) {
                    log() << "select() signal caught, continuing" << endl;
                    continue;
                }
#endif
                if ( ! inShutdown() )
                    log() << "select() failure: ret=" << ret << " " << errnoWithDescription(x) << endl;
                return;
            }

            for (vector<int>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) {
                if (! (FD_ISSET(*it, fds)))
                    continue;

                SockAddr from;
                int s = accept(*it, from.raw(), &from.addressSize);
                if ( s < 0 ) {
                    int x = errno; // so no global issues
                    if ( x == ECONNABORTED || x == EBADF ) {
                        log() << "Listener on port " << _port << " aborted" << endl;
                        return;
                    }
                    if ( x == 0 && inShutdown() ) {
                        return;   // socket closed
                    }
                    if( !inShutdown() )
                        log() << "Listener: accept() returns " << s << " " << errnoWithDescription(x) << endl;
                    continue;
                }
                if (from.getType() != AF_UNIX)
                    disableNagle(s);
                if ( _logConnect && ! cmdLine.quiet )
                    log() << "connection accepted from " << from.toString() << " #" << ++connNumber << endl;
                accepted(s, from);
            }
        }
    }
Example #7
0
    void Listener::initAndListen() {
        checkTicketNumbers();
        vector<SOCKET> socks;
        
        { // normal sockets
            vector<SockAddr> mine = ipToAddrs(_ip.c_str(), _port, (!cmdLine.noUnixSocket && useUnixSockets()));
            if ( ! _setupSockets( mine , socks ) )
                return;
        }

        SOCKET maxfd = 0; // needed for select()
        for ( unsigned i=0; i<socks.size(); i++ ) {
            if ( socks[i] > maxfd )
                maxfd = socks[i];
        }
        
#ifdef MONGO_SSL
        _logListen(_port, _ssl);
#else
        _logListen(_port, false);
#endif

        struct timeval maxSelectTime;
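        // Accept loop: poll the listening sockets with select(), accepting ready
        // connections until shutdown.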
        while ( ! inShutdown() ) {
            fd_set fds[1];
            FD_ZERO(fds);
            
            for (vector<SOCKET>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) {
                FD_SET(*it, fds);
            }

            maxSelectTime.tv_sec = 0;
            maxSelectTime.tv_usec = 10000;
            const int ret = select(maxfd+1, fds, NULL, NULL, &maxSelectTime);

            if (ret == 0) {
#if defined(__linux__)
                _elapsedTime += ( 10000 - maxSelectTime.tv_usec ) / 1000;
#else
                _elapsedTime += 10;
#endif
                continue;
            }

            if (ret < 0) {
                int x = errno;
#ifdef EINTR
                if ( x == EINTR ) {
                    log() << "select() signal caught, continuing" << endl;
                    continue;
                }
#endif
                if ( ! inShutdown() )
                    log() << "select() failure: ret=" << ret << " " << errnoWithDescription(x) << endl;
                return;
            }

#if defined(__linux__)
            _elapsedTime += max(ret, (int)(( 10000 - maxSelectTime.tv_usec ) / 1000));
#else
            _elapsedTime += ret; // assume 1ms to grab connection. very rough
#endif

            for (vector<SOCKET>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) {
                if (! (FD_ISSET(*it, fds)))
                    continue;

                SockAddr from;
                int s = accept(*it, from.raw(), &from.addressSize);
                if ( s < 0 ) {
                    int x = errno; // so no global issues
                    if ( x == ECONNABORTED || x == EBADF ) {
                        log() << "Listener on port " << _port << " aborted" << endl;
                        return;
                    }
                    if ( x == 0 && inShutdown() ) {
                        return;   // socket closed
                    }
                    if( !inShutdown() ) {
                        log() << "Listener: accept() returns " << s << " " << errnoWithDescription(x) << endl;
                        if (x == EMFILE || x == ENFILE) {
                            // Connection still in listen queue but we can't accept it yet
                            error() << "Out of file descriptors. Waiting one second before trying to accept more connections." << warnings;
                            sleepsecs(1);
                        }
                    }
                    continue;
                }
                if (from.getType() != AF_UNIX)
                    disableNagle(s);

#ifdef SO_NOSIGPIPE
                // ignore SIGPIPE signals on osx, to avoid process exit
                const int one = 1;
                setsockopt( s , SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(int));
#endif

                long long myConnectionNumber = globalConnectionNumber.addAndFetch(1);

                if ( _logConnect && ! cmdLine.quiet ){
                    int conns = globalTicketHolder.used()+1;
                    const char* word = (conns == 1 ? " connection" : " connections");
                    log() << "connection accepted from " << from.toString() << " #" << myConnectionNumber << " (" << conns << word << " now open)" << endl;
                }
                
                boost::shared_ptr<Socket> pnewSock( new Socket(s, from) );
#ifdef MONGO_SSL
                if (_ssl) {
                    pnewSock->secureAccepted(_ssl);
                }
#endif
                accepted( pnewSock , myConnectionNumber );
            }
        }
    }
Example #8
0
void BackgroundSync::_produce(
    OperationContext* txn,
    ReplicationCoordinatorExternalState* replicationCoordinatorExternalState) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    SyncSourceResolverResponse syncSourceResp =
        _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched);

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        StorageInterface::get(txn)
            ->setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen});
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode.";
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _replCoord->signalUpstreamUpdater();
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher;
    try {
        auto config = _replCoord->getConfig();
        auto onOplogFetcherShutdownCallbackFn =
            [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) {
                fetcherReturnStatus = status;
            };

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _oplogFetcher =
            stdx::make_unique<OplogFetcher>(&_threadPoolTaskExecutor,
                                            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
                                            source,
                                            NamespaceString(rsOplogName),
                                            config,
                                            &dataReplicatorExternalState,
                                            stdx::bind(&BackgroundSync::_enqueueDocuments,
                                                       this,
                                                       stdx::placeholders::_1,
                                                       stdx::placeholders::_2,
                                                       stdx::placeholders::_3,
                                                       stdx::placeholders::_4),
                                            onOplogFetcherShutdownCallbackFn);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException& ex) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at "
           << oplogFetcher->getCommandObject_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }

    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << fetcherReturnStatus.toString();
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
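        // Lazily check out a connection for rollback: the lambda creates it on
        // first use and returns the cached one afterwards.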
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), kOplogSocketTimeout));
            }
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << fetcherReturnStatus;

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        BatchBoundaries boundaries = StorageInterface::get(txn)->getMinValid(txn);
        if (!boundaries.start.isNull() || boundaries.end > lastApplied) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream()
                                      << "need to rollback, but in inconsistent state. "
                                      << "minvalid: " << boundaries.end.toString()
                                      << " > our last optime: " << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString();
    }
}
Example #9
0
void RangeDeleter::doWork() {
    Client::initThreadIfNotAlready("RangeDeleter");
    Client* client = &cc();

    while (!inShutdown() && !stopRequested()) {
        string errMsg;

        RangeDeleteEntry* nextTask = NULL;

        {
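            // Wait for a ready task; while the queue is empty, promote not-ready
            // entries whose open cursors have since closed.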
            stdx::unique_lock<stdx::mutex> sl(_queueMutex);
            while (_taskQueue.empty()) {
                _taskQueueNotEmptyCV.wait_for(
                    sl, Milliseconds(kNotEmptyTimeoutMillis).toSystemDuration());

                if (stopRequested()) {
                    log() << "stopping range deleter worker" << endl;
                    return;
                }

                if (_taskQueue.empty()) {
                    // Try to check if some deletes are ready and move them to the
                    // ready queue.

                    TaskList::iterator iter = _notReadyQueue.begin();
                    while (iter != _notReadyQueue.end()) {
                        RangeDeleteEntry* entry = *iter;

                        set<CursorId> cursorsNow;
                        if (entry->options.waitForOpenCursors) {
                            auto txn = client->makeOperationContext();
                            _env->getCursorIds(txn.get(), entry->options.range.ns, &cursorsNow);
                        }

                        set<CursorId> cursorsLeft;
                        std::set_intersection(entry->cursorsToWait.begin(),
                                              entry->cursorsToWait.end(),
                                              cursorsNow.begin(),
                                              cursorsNow.end(),
                                              std::inserter(cursorsLeft, cursorsLeft.end()));

                        entry->cursorsToWait.swap(cursorsLeft);

                        if (entry->cursorsToWait.empty()) {
                            (*iter)->stats.queueEndTS = jsTime();
                            _taskQueue.push_back(*iter);
                            _taskQueueNotEmptyCV.notify_one();
                            iter = _notReadyQueue.erase(iter);
                        } else {
                            logCursorsWaiting(entry);
                            ++iter;
                        }
                    }
                }
            }

            if (stopRequested()) {
                log() << "stopping range deleter worker" << endl;
                return;
            }

            nextTask = _taskQueue.front();
            _taskQueue.pop_front();

            _deletesInProgress++;
        }

        {
            auto txn = client->makeOperationContext();
            nextTask->stats.deleteStartTS = jsTime();
            bool delResult =
                _env->deleteRange(txn.get(), *nextTask, &nextTask->stats.deletedDocCount, &errMsg);
            nextTask->stats.deleteEndTS = jsTime();

            if (delResult) {
                nextTask->stats.waitForReplStartTS = jsTime();

                if (!_waitForMajority(txn.get(), &errMsg)) {
                    warning() << "Error encountered while waiting for replication: " << errMsg;
                }

                nextTask->stats.waitForReplEndTS = jsTime();
            } else {
                warning() << "Error encountered while trying to delete range: " << redact(errMsg);
            }
        }

        {
            stdx::lock_guard<stdx::mutex> sl(_queueMutex);

            NSMinMax setEntry(nextTask->options.range.ns,
                              nextTask->options.range.minKey,
                              nextTask->options.range.maxKey);
            deletePtrElement(&_deleteSet, &setEntry);
            _deletesInProgress--;

            if (nextTask->doneSignal) {
                nextTask->doneSignal->set();
            }
        }

        recordDelStats(new DeleteJobStats(nextTask->stats));
        delete nextTask;
        nextTask = NULL;
    }
}
Example #10
0
        void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager, vector<BSONObj>& insertsRemaining, map<ChunkPtr, vector<BSONObj> >& insertsForChunks, int retries = 0 ) {

            uassert( 16055, str::stream() << "too many retries during bulk insert, " << insertsRemaining.size() << " inserts remaining", retries < 30 );
            uassert( 16056, str::stream() << "shutting down server during bulk insert, " << insertsRemaining.size() << " inserts remaining", ! inShutdown() );

            const int flags = d.reservedField() | InsertOption_ContinueOnError; // ContinueOnError is always on when using sharding.

            _groupInserts( manager, insertsRemaining, insertsForChunks );

            while( ! insertsForChunks.empty() ){
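                // One bulk insert per chunk; a StaleConfigException triggers a
                // regroup-and-retry via recursion, bounded by the retry limit above.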

                ChunkPtr c = insertsForChunks.begin()->first;
                vector<BSONObj>& objs = insertsForChunks.begin()->second;

                const Shard& shard = c->getShard();
                const string& ns = r.getns();

                ShardConnection dbcon( shard, ns, manager );

                try {

                    LOG(4) << "  server:" << c->getShard().toString() << " bulk insert " << objs.size() << " documents" << endl;

                    // Taken from single-shard bulk insert, should not need multiple methods in future
                    // insert( c->getShard() , r.getns() , objs , flags);

                    // It's okay if the version is set here, an exception will be thrown if the version is incompatible
                    dbcon.setVersion();

                    dbcon->insert( ns , objs , flags);
                    // TODO: Option for safe inserts here - can then use this for all inserts

                    dbcon.done();

                    int bytesWritten = 0;
                    for (vector<BSONObj>::iterator vecIt = objs.begin(); vecIt != objs.end(); ++vecIt) {
                        r.gotInsert(); // Record the correct number of individual inserts
                        bytesWritten += (*vecIt).objsize();
                    }

                    // TODO: The only reason we're grouping by chunks here is for auto-split, more efficient
                    // to track this separately and bulk insert to shards
                    if ( r.getClientInfo()->autoSplitOk() )
                        c->splitIfShould( bytesWritten );

                }
                catch ( StaleConfigException& e ) {
                    // Cleanup the connection
                    dbcon.done();

                    // Assume the inserts did *not* succeed, so we don't want to erase them

                    int logLevel = retries < 2;
                    LOG( logLevel ) << "retrying bulk insert of " << objs.size() << " documents to chunk " << c << " because of StaleConfigException: " << e << endl;

                    if( retries > 2 ){
                        versionManager.forceRemoteCheckShardVersionCB( e.getns() );
                    }

                    // TODO:  Replace with actual chunk handling code, simplify request
                    r.reset();
                    manager = r.getChunkManager();

                    if( ! manager ) {
                        // TODO : We can probably handle this better?
                        uasserted( 14804, "collection no longer sharded" );
                    }
                    // End TODO

                    // We may need to regroup at least some of our inserts since our chunk manager may have changed
                    _insert( r, d, manager, insertsRemaining, insertsForChunks, retries + 1 );
                    return;
                }
                catch( UserException& ){
                    // Unexpected exception, so don't clean up the conn
                    dbcon.kill();

                    // These inserts won't be retried, as something weird happened here
                    insertsForChunks.erase( insertsForChunks.begin() );

                    // Throw if this is the last chunk bulk-inserted to
                    if( insertsForChunks.empty() ){
                        throw;
                    }
                }

                insertsForChunks.erase( insertsForChunks.begin() );
            }
        }
Example #11
0
std::string NetworkInterfaceASIO::getDiagnosticString() {
    str::stream output;
    output << "NetworkInterfaceASIO";
    output << " inShutdown: " << inShutdown();
    return output;
}
Example #12
0
        void threadRun( MessagingPort * inPort) {
            TicketHolderReleaser connTicketReleaser( &Listener::globalTicketHolder );

            {
                string threadName = "conn";
                if ( inPort->connectionId() > 0 )
                    threadName = str::stream() << threadName << inPort->connectionId();
                setThreadName( threadName.c_str() );
            }
            
            verify( inPort );
            inPort->psock->setLogLevel(1);
            scoped_ptr<MessagingPort> p( inPort );

            p->psock->postFork();

            string otherSide;

            Message m;
            try {
                LastError * le = new LastError();
                lastError.reset( le ); // lastError now has ownership

                otherSide = p->psock->remoteString();

                handler->connected( p.get() );

                while ( ! inShutdown() ) {
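                    // Receive and dispatch one message per iteration; exit when the
                    // peer disconnects or the server shuts down.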
                    m.reset();
                    p->psock->clearCounters();

                    if ( ! p->recv(m) ) {
                        if( !cmdLine.quiet ){
                            int conns = Listener::globalTicketHolder.used()-1;
                            const char* word = (conns == 1 ? " connection" : " connections");
                            log() << "end connection " << otherSide << " (" << conns << word << " now open)" << endl;
                        }
                        p->shutdown();
                        break;
                    }

                    handler->process( m , p.get() , le );
                    networkCounter.hit( p->psock->getBytesIn() , p->psock->getBytesOut() );
                }
            }
            catch ( AssertionException& e ) {
                log() << "AssertionException handling request, closing client connection: " << e << endl;
                p->shutdown();
            }
            catch ( SocketException& e ) {
                log() << "SocketException handling request, closing client connection: " << e << endl;
                p->shutdown();
            }
            catch ( const DBException& e ) { // must be right above std::exception to avoid catching subclasses
                log() << "DBException handling request, closing client connection: " << e << endl;
                p->shutdown();
            }
            catch ( std::exception &e ) {
                error() << "Uncaught std::exception: " << e.what() << ", terminating" << endl;
                dbexit( EXIT_UNCAUGHT );
            }
            catch ( ... ) {
                error() << "Uncaught exception, terminating" << endl;
                dbexit( EXIT_UNCAUGHT );
            }

            handler->disconnected( p.get() );
        }
Example #13
0
Status NetworkInterfaceASIO::startCommand(const TaskExecutor::CallbackHandle& cbHandle,
                                          RemoteCommandRequest& request,
                                          const RemoteCommandCompletionFn& onFinish) {
    MONGO_ASIO_INVARIANT(onFinish, "Invalid completion function");
    {
        stdx::lock_guard<stdx::mutex> lk(_inProgressMutex);
        const auto insertResult = _inGetConnection.emplace(cbHandle);
        // We should never see the same CallbackHandle added twice
        MONGO_ASIO_INVARIANT_INLOCK(insertResult.second, "Same CallbackHandle added twice");
    }

    if (inShutdown()) {
        return {ErrorCodes::ShutdownInProgress, "NetworkInterfaceASIO shutdown in progress"};
    }

    LOG(2) << "startCommand: " << redact(request.toString());

    auto getConnectionStartTime = now();

    auto statusMetadata = attachMetadataIfNeeded(request, _metadataHook.get());
    if (!statusMetadata.isOK()) {
        return statusMetadata;
    }

    auto nextStep = [this, getConnectionStartTime, cbHandle, request, onFinish](
        StatusWith<ConnectionPool::ConnectionHandle> swConn) {
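        // Invoked once the pool yields a connection (or an error): claim the
        // AsyncOp, arm the adjusted timeout, then begin the command.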

        if (!swConn.isOK()) {
            LOG(2) << "Failed to get connection from pool for request " << request.id << ": "
                   << swConn.getStatus();

            bool wasPreviouslyCanceled = false;
            {
                stdx::lock_guard<stdx::mutex> lk(_inProgressMutex);
                wasPreviouslyCanceled = _inGetConnection.erase(cbHandle) == 0;
            }

            onFinish(wasPreviouslyCanceled
                         ? Status(ErrorCodes::CallbackCanceled, "Callback canceled")
                         : swConn.getStatus());
            signalWorkAvailable();
            return;
        }

        auto conn = static_cast<connection_pool_asio::ASIOConnection*>(swConn.getValue().get());

        AsyncOp* op = nullptr;

        stdx::unique_lock<stdx::mutex> lk(_inProgressMutex);

        const auto eraseCount = _inGetConnection.erase(cbHandle);

        // If we didn't find the request, we've been canceled
        if (eraseCount == 0) {
            lk.unlock();

            onFinish({ErrorCodes::CallbackCanceled, "Callback canceled"});

            // Though we were canceled, we know that the stream is fine, so indicate success.
            conn->indicateSuccess();

            signalWorkAvailable();

            return;
        }

        // We can't release the AsyncOp until we know we were not canceled.
        auto ownedOp = conn->releaseAsyncOp();
        op = ownedOp.get();

        // This AsyncOp may be recycled. We expect timeout and canceled to be clean.
        // If this op was most recently used to connect, its state transitions won't have been
        // reset, so we do that here.
        MONGO_ASIO_INVARIANT_INLOCK(!op->canceled(), "AsyncOp has dirty canceled flag", op);
        MONGO_ASIO_INVARIANT_INLOCK(!op->timedOut(), "AsyncOp has dirty timeout flag", op);
        op->clearStateTransitions();

        // Now that we're inProgress, an external cancel can touch our op, but
        // not until we release the inProgressMutex.
        _inProgress.emplace(op, std::move(ownedOp));

        op->_cbHandle = std::move(cbHandle);
        op->_request = std::move(request);
        op->_onFinish = std::move(onFinish);
        op->_connectionPoolHandle = std::move(swConn.getValue());
        op->startProgress(getConnectionStartTime);

        // This ditches the lock and gets us onto the strand (so we're
        // threadsafe)
        op->_strand.post([this, op, getConnectionStartTime] {
            // Set timeout now that we have the correct request object
            if (op->_request.timeout != RemoteCommandRequest::kNoTimeout) {
                // Subtract the time it took to get the connection from the pool from the request
                // timeout.
                auto getConnectionDuration = now() - getConnectionStartTime;
                if (getConnectionDuration >= op->_request.timeout) {
                    // We only assume that the request timer is guaranteed to fire *after* the
                    // timeout duration - but make no stronger assumption. It is thus possible that
                    // we have already exceeded the timeout. In this case we timeout the operation
                    // manually.
                    std::stringstream msg;
                    msg << "Remote command timed out while waiting to get a connection from the "
                        << "pool, took " << getConnectionDuration;
                    return _completeOperation(op, {ErrorCodes::ExceededTimeLimit, msg.str()});
                }

                // The above conditional guarantees that the adjusted timeout will never underflow.
                MONGO_ASIO_INVARIANT(
                    op->_request.timeout > getConnectionDuration, "timeout underflowed", op);
                const auto adjustedTimeout = op->_request.timeout - getConnectionDuration;
                const auto requestId = op->_request.id;

                op->_timeoutAlarm = op->_owner->_timerFactory->make(&op->_strand, adjustedTimeout);

                std::shared_ptr<AsyncOp::AccessControl> access;
                std::size_t generation;
                {
                    stdx::lock_guard<stdx::mutex> lk(op->_access->mutex);
                    access = op->_access;
                    generation = access->id;
                }

                op->_timeoutAlarm->asyncWait(
                    [this, op, access, generation, requestId, adjustedTimeout](std::error_code ec) {
                        // We must pass a check for safe access before using op inside the
                        // callback or we may attempt access on an invalid pointer.
                        stdx::lock_guard<stdx::mutex> lk(access->mutex);
                        if (generation != access->id) {
                            // The operation has been cleaned up, do not access.
                            return;
                        }

                        if (!ec) {
                            LOG(2) << "Request " << requestId << " timed out"
                                   << ", adjusted timeout after getting connection from pool was "
                                   << adjustedTimeout << ", op was " << redact(op->toString());

                            op->timeOut_inlock();
                        } else {
                            LOG(2) << "Failed to time request " << requestId
                                   << "out: " << ec.message() << ", op was "
                                   << redact(op->toString());
                        }
                    });
            }

            _beginCommunication(op);
        });
    };

    _connectionPool.get(request.target, request.timeout, nextStep);
    return Status::OK();
}
Example #14
0
    bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
        bool ok = true;

        DbMessage d(m);

        const char *ns = d.getns();
        int ntoreturn = d.pullInt();
        long long cursorid = d.pullInt64();

        curop.debug().ns = ns;
        curop.debug().ntoreturn = ntoreturn;
        curop.debug().cursorid = cursorid;

        shared_ptr<AssertionException> ex;
        scoped_ptr<Timer> timer;
        int pass = 0;
        bool exhaust = false;
        QueryResult* msgdata = 0;
        OpTime last;
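        // Retry loop for awaitData getMores on the oplog: when no data is ready,
        // sleep briefly and retry until data arrives or a few seconds elapse.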
        while( 1 ) {
            try {
                if (str::startsWith(ns, "local.oplog.")){
                    if (pass == 0) {
                        mutex::scoped_lock lk(OpTime::m);
                        last = OpTime::getLast(lk);
                    }
                    else {
                        last.waitForDifferent(1000/*ms*/);
                    }
                }

                Client::ReadContext ctx(ns);

                // call this readlocked so state can't change
                replVerifyReadsOk();
                msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust);
            }
            catch ( AssertionException& e ) {
                ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
                ok = false;
                break;
            }
            
            if (msgdata == 0) {
                // this should only happen with QueryOption_AwaitData
                exhaust = false;
                massert(13073, "shutting down", !inShutdown() );
                if ( ! timer ) {
                    timer.reset( new Timer() );
                }
                else {
                    if ( timer->seconds() >= 4 ) {
                        // after about 4 seconds, return. pass stops at 1000 normally.
                        // we want to return occasionally so slave can checkpoint.
                        pass = 10000;
                    }
                }
                pass++;
                if (debug)
                    sleepmillis(20);
                else
                    sleepmillis(2);
                
                // note: the 1100 is because of the waitForDifferent above
                // should eventually clean this up a bit
                curop.setExpectedLatencyMs( 1100 + timer->millis() );
                
                continue;
            }
            break;
        }

        if (ex) {
            exhaust = false;

            BSONObjBuilder err;
            ex->getInfo().append( err );
            BSONObj errObj = err.done();

            log() << errObj << endl;

            curop.debug().exceptionInfo = ex->getInfo();

            if (ex->getCode() == 13436) {
                replyToQuery(ResultFlag_ErrSet, m, dbresponse, errObj);
                curop.debug().responseLength = dbresponse.response->header()->dataLen();
                curop.debug().nreturned = 1;
                return ok;
            }

            msgdata = emptyMoreResult(cursorid);
        }

        Message *resp = new Message();
        resp->setData(msgdata, true);
        curop.debug().responseLength = resp->header()->dataLen();
        curop.debug().nreturned = msgdata->nReturned;

        dbresponse.response = resp;
        dbresponse.responseTo = m.header()->id;
        
        if( exhaust ) {
            curop.debug().exhaust = true;
            dbresponse.exhaust = ns;
        }

        return ok;
    }
Example #15
0
        /**
         * Handles incoming messages from a given socket.
         *
         * Terminating conditions:
         * 1. Assertions while handling the request.
         * 2. Socket is closed.
         * 3. Server is shutting down (based on inShutdown)
         *
         * @param arg this method is in charge of cleaning up the arg object.
         *
         * @return NULL
         */
        static void* handleIncomingMsg(void* arg) {
            TicketHolderReleaser connTicketReleaser( &Listener::globalTicketHolder );

            scoped_ptr<HandleIncomingMsgParam> himArg(static_cast<HandleIncomingMsgParam*>(arg));
            MessagingPort* inPort = himArg->inPort;
            MessageHandler* handler = himArg->handler;

            {
                string threadName = "conn";
                if ( inPort->connectionId() > 0 )
                    threadName = str::stream() << threadName << inPort->connectionId();
                setThreadName( threadName.c_str() );
            }

            verify( inPort );
            inPort->psock->setLogLevel(logger::LogSeverity::Debug(1));
            scoped_ptr<MessagingPort> p( inPort );

            string otherSide;

            Message m;
            try {
                LastError * le = new LastError();
                lastError.reset( le ); // lastError now has ownership

                otherSide = p->psock->remoteString();

                handler->connected( p.get() );

                while ( ! inShutdown() ) {
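                    // Process one incoming message per iteration, until shutdown
                    // or the socket closes.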
                    m.reset();
                    p->psock->clearCounters();

                    if ( ! p->recv(m) ) {
                        if (!serverGlobalParams.quiet) {
                            int conns = Listener::globalTicketHolder.used()-1;
                            const char* word = (conns == 1 ? " connection" : " connections");
                            log() << "end connection " << otherSide << " (" << conns << word << " now open)" << endl;
                        }
                        p->shutdown();
                        break;
                    }

                    handler->process( m , p.get() , le );
                    networkCounter.hit( p->psock->getBytesIn() , p->psock->getBytesOut() );
                }
            }
            catch ( AssertionException& e ) {
                log() << "AssertionException handling request, closing client connection: " << e << endl;
                p->shutdown();
            }
            catch ( SocketException& e ) {
                log() << "SocketException handling request, closing client connection: " << e << endl;
                p->shutdown();
            }
            catch ( const DBException& e ) { // must be right above std::exception to avoid catching subclasses
                log() << "DBException handling request, closing client connection: " << e << endl;
                p->shutdown();
            }
            catch ( std::exception &e ) {
                error() << "Uncaught std::exception: " << e.what() << ", terminating" << endl;
                dbexit( EXIT_UNCAUGHT );
            }

            // Normal disconnect path.
#ifdef MONGO_SSL
            SSLManagerInterface* manager = getSSLManager();
            if (manager)
                manager->cleanupThreadLocals();
#endif
            handler->disconnected( p.get() );

            return NULL;
        }
Example #16
0
    void Listener::initAndListen() {
        if (!_setupSocketsSuccessful) {
            return;
        }

#ifdef MONGO_SSL
        _logListen(_port, _ssl);
#else
        _logListen(_port, false);
#endif
                
        OwnedPointerVector<EventHolder> eventHolders;
        boost::scoped_array<WSAEVENT> events(new WSAEVENT[_socks.size()]);
        
        
        // Populate events array with an event for each socket we are watching
        for (size_t count = 0; count < _socks.size(); ++count) {
            EventHolder* ev(new EventHolder);
            eventHolders.mutableVector().push_back(ev);
            events[count] = ev->get();            
        }
            
        while ( ! inShutdown() ) {
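            // Re-arm accept/close events on every socket, then wait up to 10ms for
            // one of them to signal.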
            // Turn on listening for accept-ready sockets
            for (size_t count = 0; count < _socks.size(); ++count) {
                int status = WSAEventSelect(_socks[count], events[count], FD_ACCEPT | FD_CLOSE);
                if (status == SOCKET_ERROR) {
                    const int mongo_errno = WSAGetLastError();

                    // During shutdown, we may fail to listen on the socket if it has already
                    // been closed
                    if (inShutdown()) {
                        return;
                    }

                    error() << "Windows WSAEventSelect returned "
                        << errnoWithDescription(mongo_errno) << endl;
                    fassertFailed(16727);
                }
            }
        
            // Wait till one of them goes active, or we time out
            DWORD result = WSAWaitForMultipleEvents(_socks.size(),
                                                    events.get(), 
                                                    FALSE, // don't wait for all the events
                                                    10, // timeout, in ms 
                                                    FALSE); // do not allow I/O interruptions
            if (result == WSA_WAIT_FAILED) {
                const int mongo_errno = WSAGetLastError();
                error() << "Windows WSAWaitForMultipleEvents returned " 
                    << errnoWithDescription(mongo_errno) << endl;
                fassertFailed(16723);
            }
        
            if (result == WSA_WAIT_TIMEOUT) {
                _elapsedTime += 10;
                continue;
            }
            _elapsedTime += 1; // assume 1ms to grab connection. very rough
            
            // Determine which socket is ready
            DWORD eventIndex = result - WSA_WAIT_EVENT_0;
            WSANETWORKEVENTS networkEvents;            
            // Extract event details, and clear event for next pass
            int status = WSAEnumNetworkEvents(_socks[eventIndex],
                                              events[eventIndex], 
                                              &networkEvents);
            if (status == SOCKET_ERROR) {
                const int mongo_errno = WSAGetLastError();
                error() << "Windows WSAEnumNetworkEvents returned " 
                    << errnoWithDescription(mongo_errno) << endl;
                continue;
            }
            
            if (networkEvents.lNetworkEvents & FD_CLOSE) {              
                log() << "listen socket closed" << endl;
                break;
            }
            
            if (!(networkEvents.lNetworkEvents & FD_ACCEPT)) {
                error() << "Unexpected network event: " << networkEvents.lNetworkEvents << endl;
                continue;
            }
            
            int iec = networkEvents.iErrorCode[FD_ACCEPT_BIT];
            if (iec != 0) {                 
                error() << "Windows socket accept did not work:" << errnoWithDescription(iec) 
                        << endl;
                continue;
            }
            
            status = WSAEventSelect(_socks[eventIndex], NULL, 0);
            if (status == SOCKET_ERROR) {
                const int mongo_errno = WSAGetLastError();
                error() << "Windows WSAEventSelect returned " 
                    << errnoWithDescription(mongo_errno) << endl;
                continue;
            }
            
            disableNonblockingMode(_socks[eventIndex]);
            
            SockAddr from;
            int s = accept(_socks[eventIndex], from.raw(), &from.addressSize);
            if ( s < 0 ) {
                int x = errno; // so no global issues
                if (x == EBADF) {
                    log() << "Port " << _port << " is no longer valid" << endl;
                    continue;
                }
                else if (x == ECONNABORTED) {
                    log() << "Listener on port " << _port << " aborted" << endl;
                    continue;
                }
                if ( x == 0 && inShutdown() ) {
                    return;   // socket closed
                }
                if( !inShutdown() ) {
                    log() << "Listener: accept() returns " << s << " " 
                        << errnoWithDescription(x) << endl;
                    if (x == EMFILE || x == ENFILE) {
                        // Connection still in listen queue but we can't accept it yet
                        error() << "Out of file descriptors. Waiting one second before"
                            " trying to accept more connections." << warnings;
                        sleepsecs(1);
                    }
                }
                continue;
            }
            if (from.getType() != AF_UNIX)
                disableNagle(s);

            long long myConnectionNumber = globalConnectionNumber.addAndFetch(1);

            if (_logConnect && !serverGlobalParams.quiet) {
                int conns = globalTicketHolder.used()+1;
                const char* word = (conns == 1 ? " connection" : " connections");
                log() << "connection accepted from " << from.toString() << " #" << myConnectionNumber << " (" << conns << word << " now open)" << endl;
            }
            
            boost::shared_ptr<Socket> pnewSock( new Socket(s, from) );
#ifdef MONGO_SSL
            if (_ssl) {
                pnewSock->secureAccepted(_ssl);
            }
#endif
            accepted( pnewSock , myConnectionNumber );
        }
    }
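The loop above is Windows-specific, but its shape is general: wait on the listening socket with a short timeout so the shutdown flag is re-checked on every pass, then accept and hand the connection off. Below is a minimal POSIX sketch of the same pattern using poll(); the 10 ms timeout mirrors the code above, and the shuttingDown flag is our stand-in for inShutdown().

#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>
#include <atomic>
#include <cstdio>

std::atomic<bool> shuttingDown{false};   // stand-in for inShutdown()

void acceptLoop(int listenFd) {
    pollfd pfd{listenFd, POLLIN, 0};
    while (!shuttingDown.load()) {
        int n = poll(&pfd, 1, 10);       // 10 ms timeout, like WSAWaitForMultipleEvents
        if (n < 0) { perror("poll"); break; }
        if (n == 0) continue;            // timed out; loop to re-check the flag
        sockaddr_in from{};
        socklen_t len = sizeof(from);
        int s = accept(listenFd, reinterpret_cast<sockaddr*>(&from), &len);
        if (s < 0) { perror("accept"); continue; }
        close(s);                        // a real server would hand `s` off here
    }
}

int main() {
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    sockaddr_in addr{};
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port = 0;                   // bind to any free port
    bind(fd, reinterpret_cast<sockaddr*>(&addr), sizeof(addr));
    listen(fd, 16);
    shuttingDown = true;                 // demo only: exit after the first check
    acceptLoop(fd);
    close(fd);
}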
Example #17
void TransportLayerMock::shutdown() {
    if (!inShutdown()) {
        _shutdown = true;
        endAllSessions(Session::kEmptyTagMask);
    }
}
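TransportLayerMock::shutdown() shows the guard-then-set idiom that makes shutdown safe to call more than once. A standalone sketch of the same idea, using std::atomic so the guard also holds across threads (the class and names are illustrative, not MongoDB API):

#include <atomic>
#include <iostream>

class Worker {
public:
    void shutdown() {
        // exchange() returns the previous value, so only the first caller
        // runs the teardown; every later call is a no-op.
        if (_shutdown.exchange(true)) return;
        std::cout << "ending all sessions\n";
    }
private:
    std::atomic<bool> _shutdown{false};
};

int main() {
    Worker w;
    w.shutdown();  // tears down
    w.shutdown();  // no-op
}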
Example #18
void RangeDeleter::doWork() {
    _env->initThread();

    while (!inShutdown() && !stopRequested()) {
        string errMsg;

        boost::scoped_ptr<OperationContext> txn(getGlobalEnvironment()->newOpCtx());

        RangeDeleteEntry* nextTask = NULL;

        {
            scoped_lock sl(_queueMutex);
            while (_taskQueue.empty()) {
                _taskQueueNotEmptyCV.timed_wait(sl.boost(),
                                                duration::milliseconds(kNotEmptyTimeoutMillis));

                if (stopRequested()) {
                    log() << "stopping range deleter worker" << endl;
                    return;
                }

                if (_taskQueue.empty()) {
                    // Try to check if some deletes are ready and move them to the
                    // ready queue.

                    TaskList::iterator iter = _notReadyQueue.begin();
                    while (iter != _notReadyQueue.end()) {
                        RangeDeleteEntry* entry = *iter;

                        set<CursorId> cursorsNow;
                        {
                            if (entry->options.waitForOpenCursors) {
                                _env->getCursorIds(txn.get(), entry->options.range.ns, &cursorsNow);
                            }
                        }

                        set<CursorId> cursorsLeft;
                        std::set_intersection(entry->cursorsToWait.begin(),
                                              entry->cursorsToWait.end(),
                                              cursorsNow.begin(),
                                              cursorsNow.end(),
                                              std::inserter(cursorsLeft, cursorsLeft.end()));

                        entry->cursorsToWait.swap(cursorsLeft);

                        if (entry->cursorsToWait.empty()) {
                            (*iter)->stats.queueEndTS = jsTime();
                            _taskQueue.push_back(*iter);
                            _taskQueueNotEmptyCV.notify_one();
                            iter = _notReadyQueue.erase(iter);
                        } else {
                            logCursorsWaiting(entry);
                            ++iter;
                        }
                    }
                }
            }

            if (stopRequested()) {
                log() << "stopping range deleter worker" << endl;
                return;
            }

            nextTask = _taskQueue.front();
            _taskQueue.pop_front();

            _deletesInProgress++;
        }

        {
            nextTask->stats.deleteStartTS = jsTime();
            bool delResult =
                _env->deleteRange(txn.get(), *nextTask, &nextTask->stats.deletedDocCount, &errMsg);
            nextTask->stats.deleteEndTS = jsTime();

            if (delResult) {
                nextTask->stats.waitForReplStartTS = jsTime();

                if (!_waitForMajority(txn.get(), &errMsg)) {
                    warning() << "Error encountered while waiting for replication: " << errMsg;
                }

                nextTask->stats.waitForReplEndTS = jsTime();
            } else {
                warning() << "Error encountered while trying to delete range: " << errMsg << endl;
            }
        }

        {
            scoped_lock sl(_queueMutex);

            NSMinMax setEntry(nextTask->options.range.ns,
                              nextTask->options.range.minKey,
                              nextTask->options.range.maxKey);
            deletePtrElement(&_deleteSet, &setEntry);
            _deletesInProgress--;

            if (nextTask->notifyDone) {
                nextTask->notifyDone->notifyOne();
            }
        }

        recordDelStats(new DeleteJobStats(nextTask->stats));
        delete nextTask;
        nextTask = NULL;
    }
}
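The worker above never blocks indefinitely: it waits on the condition variable with a timeout so that every wakeup can re-check stopRequested() and, in the original, promote not-yet-ready deletes into the ready queue. A condensed standalone sketch of that wait loop; the types and the 200 ms figure are illustrative (kNotEmptyTimeoutMillis is internal to the original):

#include <chrono>
#include <condition_variable>
#include <deque>
#include <iostream>
#include <mutex>
#include <thread>

// Hypothetical queue mirroring the shape of the loop above.
struct TaskQueue {
    std::mutex mtx;
    std::condition_variable notEmpty;
    std::deque<int> ready;
    bool stop = false;

    void worker() {
        for (;;) {
            int task;
            {
                std::unique_lock<std::mutex> lk(mtx);
                while (ready.empty()) {
                    // Wake at least every 200 ms, like kNotEmptyTimeoutMillis.
                    notEmpty.wait_for(lk, std::chrono::milliseconds(200));
                    if (stop) return;
                }
                task = ready.front();
                ready.pop_front();
            }
            std::cout << "deleting range " << task << '\n';  // work happens unlocked
        }
    }
};

int main() {
    TaskQueue q;
    std::thread t(&TaskQueue::worker, &q);
    {
        std::lock_guard<std::mutex> lk(q.mtx);
        q.ready.push_back(42);
    }
    q.notEmpty.notify_one();
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    {
        std::lock_guard<std::mutex> lk(q.mtx);
        q.stop = true;
    }
    t.join();
}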
Example #19
    void BackgroundSync::produce(OperationContext* txn) {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_lastOpTimeFetched.isNull()) {
                // then we're initial syncing and we're still waiting for this to be set
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }

            // Wait until we've applied the ops we have before we choose a sync target
            while (!_appliedBuffer && !inShutdown()) {
                _condvar.wait(lock);
            }
            if (inShutdown()) {
                return;
            }
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }


        // find a target to sync from the last optime fetched
        OpTime lastOpTimeFetched;
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
            _syncSourceHost = HostAndPort();
        }
        _syncSourceReader.resetConnection();
        _syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            // no server found
            if (_syncSourceReader.getHost().empty()) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }
            lastOpTimeFetched = _lastOpTimeFetched;
            _syncSourceHost = _syncSourceReader.getHost();
            _replCoord->signalUpstreamUpdater();
        }

        _syncSourceReader.tailingQueryGTE(rsoplog, lastOpTimeFetched);

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!_syncSourceReader.haveCursor()) {
            return;
        }

        if (_rollbackIfNeeded(txn, _syncSourceReader)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            if (!_syncSourceReader.moreInCurrentBatch()) {
                // Check some things periodically
                // (whenever we run out of items in the
                // current cursor batch)

                int bs = _syncSourceReader.currentBatchMessageSize();
                if( bs > 0 && bs < BatchIsSmallish ) {
                    // on a very low latency network, if we don't wait a little, we'll be 
                    // getting ops to write almost one at a time.  this will both be expensive
                    // for the upstream server as well as potentially defeating our parallel 
                    // application of batches on the secondary.
                    //
                    // the inference here is basically if the batch is really small, we are 
                    // "caught up".
                    //
                    sleepmillis(SleepToAllowBatchingMillis);
                }

                // If we are transitioning to primary state, we need to leave
                // this loop in order to go into bgsync-pause mode.
                if (_replCoord->isWaitingForApplierToDrain() || 
                    _replCoord->getCurrentMemberState().primary()) {
                    return;
                }

                // re-evaluate quality of sync target
                if (shouldChangeSyncSource()) {
                    return;
                }

                {
                    //record time for each getmore
                    TimerHolder batchTimer(&getmoreReplStats);
                    
                    // This calls receiveMore() on the oplogreader cursor.
                    // It can wait up to five seconds for more data.
                    _syncSourceReader.more();
                }
                networkByteStats.increment(_syncSourceReader.currentBatchMessageSize());

                if (!_syncSourceReader.moreInCurrentBatch()) {
                    // If there is still no data from upstream, check a few more things
                    // and then loop back for another pass at getting more data
                    {
                        boost::unique_lock<boost::mutex> lock(_mutex);
                        if (_pause) {
                            return;
                        }
                    }

                    _syncSourceReader.tailCheck();
                    if( !_syncSourceReader.haveCursor() ) {
                        LOG(1) << "replSet end syncTail pass";
                        return;
                    }

                    continue;
                }
            }

            // If we are transitioning to primary state, we need to leave
            // this loop in order to go into bgsync-pause mode.
            if (_replCoord->isWaitingForApplierToDrain() ||
                _replCoord->getCurrentMemberState().primary()) {
                LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
                return;
            }

            // At this point, we are guaranteed to have at least one thing to read out
            // of the oplogreader cursor.
            BSONObj o = _syncSourceReader.nextSafe().getOwned();
            opsReadStats.increment();

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _appliedBuffer = false;
            }

            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
            }

            bufferCountGauge.increment();
            bufferSizeGauge.increment(getSize(o));
            _buffer.push(o);

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _lastFetchedHash = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime();
                LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty();
            }
        }
    }
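The comment in the middle of this example (it recurs in the next one) spells out the "smallish batch" heuristic: a non-empty but tiny batch means the node is effectively caught up, so it sleeps briefly to let the upstream server accumulate a fuller batch. The shape of that check in isolation, with illustrative constants (the real BatchIsSmallish and SleepToAllowBatchingMillis values are internal):

#include <chrono>
#include <iostream>
#include <thread>

constexpr int kBatchIsSmallish = 40000;        // bytes; illustrative value
constexpr int kSleepToAllowBatchingMs = 5;     // illustrative value

void maybeThrottle(int batchBytes) {
    // Non-empty but tiny: we are effectively caught up, so wait a moment
    // rather than fetching ops almost one at a time.
    if (batchBytes > 0 && batchBytes < kBatchIsSmallish) {
        std::this_thread::sleep_for(
            std::chrono::milliseconds(kSleepToAllowBatchingMs));
    }
}

int main() {
    maybeThrottle(128);    // sleeps briefly
    maybeThrottle(0);      // empty batch: no sleep
    std::cout << "done\n";
}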
Example #20
    void BackgroundSync::produce() {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        OplogReader r;
        OpTime lastOpTimeFetched;
        // find a target to sync from the last op time written
        getOplogReader(r);

        // no server found
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (_currentSyncTarget == NULL) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        r.tailingQueryGTE(rsoplog, lastOpTimeFetched);

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!r.haveCursor()) {
            return;
        }

        uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

        if (isRollbackRequired(r)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            if (!r.moreInCurrentBatch()) {
                // Check some things periodically
                // (whenever we run out of items in the
                // current cursor batch)

                int bs = r.currentBatchMessageSize();
                if( bs > 0 && bs < BatchIsSmallish ) {
                    // on a very low latency network, if we don't wait a little, we'll be 
                    // getting ops to write almost one at a time.  this will both be expensive
                    // for the upstream server as well as potentially defeating our parallel 
                    // application of batches on the secondary.
                    //
                    // the inference here is basically if the batch is really small, we are 
                    // "caught up".
                    //
                    sleepmillis(SleepToAllowBatchingMillis);
                }
  
                if (theReplSet->gotForceSync()) {
                    return;
                }
                // If we are transitioning to primary state, we need to leave
                // this loop in order to go into bgsync-pause mode.
                if (isAssumingPrimary() || theReplSet->isPrimary()) {
                    return;
                }

                // re-evaluate quality of sync target
                if (shouldChangeSyncTarget()) {
                    return;
                }


                {
                    //record time for each getmore
                    TimerHolder batchTimer(&getmoreReplStats);
                    
                    // This calls receiveMore() on the oplogreader cursor.
                    // It can wait up to five seconds for more data.
                    r.more();
                }
                networkByteStats.increment(r.currentBatchMessageSize());

                if (!r.moreInCurrentBatch()) {
                    // If there is still no data from upstream, check a few more things
                    // and then loop back for another pass at getting more data
                    {
                        boost::unique_lock<boost::mutex> lock(_mutex);
                        if (_pause || 
                            !_currentSyncTarget || 
                            !_currentSyncTarget->hbinfo().hbstate.readable()) {
                            return;
                        }
                    }

                    r.tailCheck();
                    if( !r.haveCursor() ) {
                        LOG(1) << "replSet end syncTail pass" << rsLog;
                        return;
                    }

                    continue;
                }
            }

            // At this point, we are guaranteed to have at least one thing to read out
            // of the oplogreader cursor.
            BSONObj o = r.nextSafe().getOwned();
            opsReadStats.increment();

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _appliedBuffer = false;
            }

            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
            }
            // the blocking queue will wait (forever) until there's room for us to push
            _buffer.push(o);
            bufferCountGauge.increment();
            bufferSizeGauge.increment(getSize(o));

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _lastH = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime();
                LOG(3) << "replSet lastOpTimeFetched: "
                       << _lastOpTimeFetched.toStringPretty() << rsLog;
            }
        }
    }
Example #21
    void Balancer::run() {

        // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely
        while ( ! inShutdown() ) {

            if ( ! _init() ) {
                log() << "will retry to initialize balancer in one minute" << endl;
                sleepsecs( 60 );
                continue;
            }

            break;
        }

        int sleepTime = 10;

        // getConnectionString and the dist lock constructor do not throw,
        // which is what we expect while on the balancer thread
        ConnectionString config = configServer.getConnectionString();
        DistributedLock balanceLock( config , "balancer" );

        while ( ! inShutdown() ) {

            try {

                ScopedDbConnection conn(config.toString(), 30);

                // ping has to be first so we keep things in the config server in sync
                _ping();

                // use fresh shard state
                Shard::reloadShardInfo();

                // refresh chunk size (even though another balancer might be active)
                Chunk::refreshChunkSize();

                SettingsType balancerConfig;
                string errMsg;

                if (!grid.getBalancerSettings(&balancerConfig, &errMsg)) {
                    warning() << errMsg;
                    return;
                }

                // now make sure we should even be running
                if (balancerConfig.isKeySet() && // balancer config doc exists
                        !grid.shouldBalance(balancerConfig)) {

                    LOG(1) << "skipping balancing round because balancing is disabled" << endl;

                    // Ping again so scripts can determine if we're active without waiting
                    _ping( true );

                    conn.done();

                    sleepsecs( sleepTime );
                    continue;
                }

                uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );

                {
                    dist_lock_try lk( &balanceLock , "doing balance round" );
                    if ( ! lk.got() ) {
                        LOG(1) << "skipping balancing round because another balancer is active" << endl;

                        // Ping again so scripts can determine if we're active without waiting
                        _ping( true );

                        conn.done();
                        
                        sleepsecs( sleepTime ); // no need to wake up soon
                        continue;
                    }

                    if ( !isConfigServerConsistent() ) {
                        conn.done();
                        warning() << "Skipping balancing round because data inconsistency"
                                  << " was detected amongst the config servers." << endl;
                        sleepsecs( sleepTime );
                        continue;
                    }

                    const bool waitForDelete = (balancerConfig.isWaitForDeleteSet() ?
                            balancerConfig.getWaitForDelete() : false);

                    scoped_ptr<WriteConcernOptions> writeConcern;
                    if (balancerConfig.isKeySet()) { // if balancer doc exists.
                        StatusWith<WriteConcernOptions*> extractStatus =
                                balancerConfig.extractWriteConcern();
                        if (extractStatus.isOK()) {
                            writeConcern.reset(extractStatus.getValue());
                        }
                        else {
                            warning() << extractStatus.toString();
                        }
                    }

                    LOG(1) << "*** start balancing round. "
                           << "waitForDelete: " << waitForDelete
                           << ", secondaryThrottle: "
                           << (writeConcern.get() ? writeConcern->toBSON().toString() : "default")
                           << endl;

                    vector<CandidateChunkPtr> candidateChunks;
                    _doBalanceRound( conn.conn() , &candidateChunks );
                    if ( candidateChunks.size() == 0 ) {
                        LOG(1) << "no need to move any chunk" << endl;
                        _balancedLastTime = 0;
                    }
                    else {
                        _balancedLastTime = _moveChunks(&candidateChunks,
                                                        writeConcern.get(),
                                                        waitForDelete );
                    }

                    LOG(1) << "*** end of balancing round" << endl;
                }

                // Ping again so scripts can determine if we're active without waiting
                _ping( true );
                
                conn.done();

                sleepsecs( _balancedLastTime ? sleepTime / 10 : sleepTime );
            }
            catch ( std::exception& e ) {
                log() << "caught exception while doing balance: " << e.what() << endl;

                // Just to match the opening statement if in log level 1
                LOG(1) << "*** End of balancing round" << endl;

                sleepsecs( sleepTime ); // sleep a fair amount b/c of error
                continue;
            }
        }

    }
Example #22
        void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager ) {
            const int flags = d.reservedField();
            bool keepGoing = flags & InsertOption_KeepGoing; // modified before the assertion below if we should abort

            while ( d.moreJSObjs() ) {
                try {
                    BSONObj o = d.nextJsObj();
                    if ( ! manager->hasShardKey( o ) ) {

                        bool bad = true;

                        if ( manager->getShardKey().partOfShardKey( "_id" ) ) {
                            BSONObjBuilder b;
                            b.appendOID( "_id" , 0 , true );
                            b.appendElements( o );
                            o = b.obj();
                            bad = ! manager->hasShardKey( o );
                        }

                        if ( bad ) {
                            log() << "tried to insert object without shard key: " << r.getns() << "  " << o << endl;
                            uasserted( 8011 , "tried to insert object without shard key" );
                        }

                    }

                    // Many operations benefit from having the shard key early in the object
                    o = manager->getShardKey().moveToFront(o);

                    const int maxTries = 30;

                    bool gotThrough = false;
                    for ( int i=0; i<maxTries; i++ ) {
                        try {
                            ChunkPtr c = manager->findChunk( o );
                            log(4) << "  server:" << c->getShard().toString() << " " << o << endl;
                            insert( c->getShard() , r.getns() , o , flags);

                            r.gotInsert();
                            if ( r.getClientInfo()->autoSplitOk() )
                                c->splitIfShould( o.objsize() );
                            gotThrough = true;
                            break;
                        }
                        catch ( StaleConfigException& e ) {
                            int logLevel = i < ( maxTries / 2 );
                            LOG( logLevel ) << "retrying insert because of StaleConfigException: " << e << " object: " << o << endl;
                            r.reset();
                            
                            unsigned long long old = manager->getSequenceNumber();
                            manager = r.getChunkManager();

                            if (!manager) {
                                keepGoing = false;
                                uasserted(14804, "collection no longer sharded");
                            }

                            LOG( logLevel ) << "  sequence number - old: " << old << " new: " << manager->getSequenceNumber() << endl;
                        }
                        sleepmillis( i * 20 );
                    }
                    
                    assert( inShutdown() || gotThrough ); // not caught below
                } catch (const UserException&){
                    if (!keepGoing || !d.moreJSObjs()){
                        throw;
                    }
                    // otherwise ignore and keep going
                }
            }
        }
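The insert path above retries on StaleConfigException up to maxTries times, backing off linearly (sleepmillis(i * 20)) and switching from quiet to loud logging once half the attempts are spent. The bare retry skeleton as a sketch, with std::runtime_error standing in for the exception type and all names illustrative:

#include <chrono>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <thread>

bool insertWithRetry(const std::function<void()>& attempt, int maxTries = 30) {
    for (int i = 0; i < maxTries; ++i) {
        try {
            attempt();
            return true;
        } catch (const std::runtime_error& e) {
            // Quiet (level 1) for the first half of the attempts, loud after.
            int logLevel = i < maxTries / 2;
            std::cerr << "retrying at log level " << logLevel
                      << ": " << e.what() << '\n';
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(i * 20));
    }
    return false;
}

int main() {
    int calls = 0;
    bool ok = insertWithRetry([&] {
        if (++calls < 3) throw std::runtime_error("stale config");
    });
    std::cout << (ok ? "inserted" : "gave up") << " after " << calls << " calls\n";
}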
Example #23
    void BackgroundSync::produce() {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        OplogReader r(false /* doHandshake */);

        // find a target to sync from the last op time written
        getOplogReader(r);

        // no server found
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (_currentSyncTarget == NULL) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }

            r.tailingQueryGTE(rsoplog, _lastOpTimeFetched);
        }

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!r.haveCursor()) {
            return;
        }

        uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

        if (isRollbackRequired(r)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            while (!inShutdown()) {
                if (!r.moreInCurrentBatch()) {
                    if (theReplSet->gotForceSync()) {
                        return;
                    }

                    if (theReplSet->isPrimary()) {
                        return;
                    }

                    {
                        boost::unique_lock<boost::mutex> lock(_mutex);
                        if (!_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                            return;
                        }
                    }

                    r.more();
                }

                if (!r.more())
                    break;

                BSONObj o = r.nextSafe().getOwned();

                Timer timer;
                // the blocking queue will wait (forever) until there's room for us to push
                OCCASIONALLY {
                    LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
                }
                _buffer.push(o);

                {
                    boost::unique_lock<boost::mutex> lock(_mutex);

                    // update counters
                    _queueCounter.waitTime += timer.millis();
                    _queueCounter.numElems++;
                    _lastH = o["h"].numberLong();
                    _lastOpTimeFetched = o["ts"]._opTime();
                }
            } // end while

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                    return;
                }
            }


            r.tailCheck();
            if( !r.haveCursor() ) {
                LOG(1) << "replSet end syncTail pass" << rsLog;
                return;
            }

            // looping back is ok because this is a tailable cursor
        }
    }
Example #24
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Status* remoteOplogStartStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        return;
    }

    if (inShutdown()) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    const auto& queryResponse = result.getValue();

    // Forward metadata (containing liveness information) to replication coordinator.
    bool receivedMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedMetadata) {
        auto metadataResult =
            rpc::ReplSetMetadata::readFromMetadata(queryResponse.otherFields.metadata);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << source << ": "
                    << metadataResult.getStatus() << ": " << queryResponse.otherFields.metadata;
            return;
        }
        _replCoord->processReplSetMetadata(metadataResult.getValue());
    }

    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };

        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();

        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = fassertStatusOK(28770, OpTime::parseFromBSON(o));
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(durationCount<Milliseconds>(queryResponse.elapsedMillis));

    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically
    // (whenever we run out of items in the
    // current cursor batch)
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // on a very low latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time.  this will both be expensive
        // for the upstream server as well as potentially defeating our parallel
        // application of batches on the secondary.
        //
        // the inference here is basically if the batch is really small, we are
        // "caught up".
        //
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to leave
    // this loop in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to process with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    bob->append("maxTimeMS", durationCount<Milliseconds>(fetcherMaxTimeMS));
    if (receivedMetadata) {
        bob->append("term", _replCoord->getTerm());
    }
}
Example #25
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    OpQueueBatcher batcher(this);

    OperationContextImpl txn;
    auto replCoord = ReplicationCoordinator::get(&txn);
    ApplyBatchFinalizer finalizer(replCoord);

    OpTime originalEndOpTime(getMinValid(&txn).end);
    while (!inShutdown()) {
        OpQueue ops;

        do {
            if (BackgroundSync::get()->getInitialSyncRequestedFlag()) {
                // got a resync command
                return;
            }

            tryToGoLiveAsASecondary(&txn, replCoord);

            // Blocks up to a second waiting for a batch to be ready to apply. If one doesn't become
            // ready in time, we'll loop again so we can do the above checks periodically.
            ops = batcher.getNextBatch(Seconds(1));
        } while (!inShutdown() && ops.empty());

        if (inShutdown())
            return;

        invariant(!ops.empty());

        const BSONObj lastOp = ops.back().raw;

        if (lastOp.isEmpty()) {
            // This means that the network thread has coalesced and we have processed all of its
            // data.
            invariant(ops.getDeque().size() == 1);
            if (replCoord->isWaitingForApplierToDrain()) {
                replCoord->signalDrainComplete(&txn);
            }
            continue;  // This wasn't a real op. Don't try to apply it.
        }

        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before the update finishes.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
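The long comment above traces the minValid rule with concrete batches: the consistency end point only moves forward, so it is always the max of the recovered end and the current batch's end. The same arithmetic as a toy, with a plain int standing in for OpTime:

#include <algorithm>
#include <cassert>

using OpTime = int;  // stand-in for the real OpTime type

OpTime nextMinValidEnd(OpTime originalEnd, OpTime batchEnd) {
    return std::max(originalEnd, batchEnd);
}

int main() {
    // batch apply 20-40 fails with end = 40; after restart:
    OpTime end = nextMinValidEnd(40, 25);  // batch 20-25 -> end stays 40
    assert(end == 40);
    end = nextMinValidEnd(end, 45);        // batch 25-45 -> end = 45
    assert(end == 45);
}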
Example #26
    void Balancer::run() {

        // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely
        while ( ! inShutdown() ) {

            if ( ! _init() ) {
                log() << "will retry to initialize balancer in one minute" << endl;
                sleepsecs( 60 );
                continue;
            }

            break;
        }

        // getConnectionString and the dist lock constructor do not throw,
        // which is what we expect while on the balancer thread
        ConnectionString config = configServer.getConnectionString();
        DistributedLock balanceLock( config , "balancer" );

        while ( ! inShutdown() ) {

            try {
                
                ScopedDbConnection conn( config );
                
                // ping has to be first so we keep things in the config server in sync
                _ping( conn.conn() );

                // now make sure we should even be running
                if ( ! grid.shouldBalance() ) {
                    LOG(1) << "skipping balancing round because balancing is disabled" << endl;
                    conn.done();
                    
                    sleepsecs( 30 );
                    continue;
                }
                
                uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );

                // use fresh shard state
                Shard::reloadShardInfo();

                {
                    dist_lock_try lk( &balanceLock , "doing balance round" );
                    if ( ! lk.got() ) {
                        LOG(1) << "skipping balancing round because another balancer is active" << endl;
                        conn.done();
                        
                        sleepsecs( 30 ); // no need to wake up soon
                        continue;
                    }
                    
                    LOG(1) << "*** start balancing round" << endl;
                    
                    vector<CandidateChunkPtr> candidateChunks;
                    _doBalanceRound( conn.conn() , &candidateChunks );
                    if ( candidateChunks.size() == 0 ) {
                        LOG(1) << "no need to move any chunk" << endl;
                    }
                    else {
                        _balancedLastTime = _moveChunks( &candidateChunks );
                    }
                    
                    LOG(1) << "*** end of balancing round" << endl;
                }
                
                conn.done();
                    
                sleepsecs( _balancedLastTime ? 5 : 10 );
            }
            catch ( std::exception& e ) {
                log() << "caught exception while doing balance: " << e.what() << endl;

                // Just to match the opening statement if in log level 1
                LOG(1) << "*** End of balancing round" << endl;

                sleepsecs( 30 ); // sleep a fair amount b/c of error
                continue;
            }
        }

    }
Example #27
    void SyncSourceFeedback::run() {
        Client::initThread("SyncSourceFeedbackThread");
        OperationContextImpl txn;

        bool positionChanged = false;
        bool handshakeNeeded = false;
        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
        while (!inShutdown()) { // TODO(spencer): Remove once legacy repl coordinator is gone.
            {
                boost::unique_lock<boost::mutex> lock(_mtx);
                while (!_positionChanged && !_handshakeNeeded && !_shutdownSignaled) {
                    _cond.wait(lock);
                }

                if (_shutdownSignaled) {
                    break;
                }

                positionChanged = _positionChanged;
                handshakeNeeded = _handshakeNeeded;
                _positionChanged = false;
                _handshakeNeeded = false;
            }

            MemberState state = replCoord->getCurrentMemberState();
            if (state.primary() || state.fatal() || state.startup()) {
                _resetConnection();
                continue;
            }
            const Member* target = BackgroundSync::get()->getSyncTarget();
            if (_syncTarget != target) {
                _resetConnection();
                _syncTarget = target;
            }
            if (!hasConnection()) {
                // fix connection if need be
                if (!target) {
                    sleepmillis(500);
                    continue;
                }
                if (!_connect(&txn, target->h())) {
                    sleepmillis(500);
                    continue;
                }
                handshakeNeeded = true;
            }
            if (handshakeNeeded) {
                if (!replHandshake(&txn)) {
                    boost::unique_lock<boost::mutex> lock(_mtx);
                    _handshakeNeeded = true;
                    continue;
                }
            }
            if (positionChanged) {
                if (!updateUpstream(&txn)) {
                    boost::unique_lock<boost::mutex> lock(_mtx);
                    _positionChanged = true;
                }
            }
        }
        cc().shutdown();
    }
Example #28
    void WriteBackListener::run() {

        int secsToSleep = 0;
        scoped_ptr<ChunkVersion> lastNeededVersion;
        int lastNeededCount = 0;
        bool needsToReloadShardInfo = false;

        while ( ! inShutdown() ) {

            if ( ! Shard::isAShardNode( _addr ) ) {
                LOG(1) << _addr << " is not a shard node" << endl;
                sleepsecs( 60 );
                continue;
            }

            try {
                if (needsToReloadShardInfo) {
                    // It's possible this shard was removed
                    Shard::reloadShardInfo();
                    needsToReloadShardInfo = false;
                }

                ScopedDbConnection conn(_addr);

                BSONObj result;

                {
                    BSONObjBuilder cmd;
                    cmd.appendOID( "writebacklisten" , &serverID ); // Command will block for data
                    if ( ! conn->runCommand( "admin" , cmd.obj() , result ) ) {
                        result = result.getOwned();
                        log() <<  "writebacklisten command failed!  "  << result << endl;
                        conn.done();
                        continue;
                    }

                }
                conn.done();

                LOG(1) << "writebacklisten result: " << result << endl;

                BSONObj data = result.getObjectField( "data" );
                if ( data.getBoolField( "writeBack" ) ) {
                    string ns = data["ns"].valuestrsafe();

                    ConnectionIdent cid( "" , 0 );
                    OID wid;
                    if ( data["connectionId"].isNumber() && data["id"].type() == jstOID ) {
                        string s = "";
                        if ( data["instanceIdent"].type() == String )
                            s = data["instanceIdent"].String();
                        cid = ConnectionIdent( s , data["connectionId"].numberLong() );
                        wid = data["id"].OID();
                    }
                    else {
                        warning() << "mongos/mongod version mismatch (1.7.5 is the split)" << endl;
                    }

                    int len; // not used, but needed for next call
                    Message msg( (void*)data["msg"].binData( len ) , false );
                    massert( 10427 ,  "invalid writeback message" , msg.header()->valid() );

                    DBConfigPtr db = grid.getDBConfig( ns );
                    ChunkVersion needVersion = ChunkVersion::fromBSON( data, "version" );

                    //
                    // TODO: Refactor the sharded strategy to correctly handle all sharding state changes itself,
                    // we can't rely on WBL to do this for us b/c anything could reset our state in-between.
                    // We should always reload here for efficiency when possible, but staleness is also caught in the
                    // loop below.
                    //

                    ChunkManagerPtr manager;
                    ShardPtr primary;
                    db->getChunkManagerOrPrimary( ns, manager, primary );

                    ChunkVersion currVersion;
                    if( manager ) currVersion = manager->getVersion();

                    LOG(1) << "connectionId: " << cid << " writebackId: " << wid << " needVersion : " << needVersion.toString()
                           << " mine : " << currVersion.toString() << endl;

                    LOG(1) << msg.toString() << endl;

                    //
                    // We should reload only if we need to update our version to be compatible *and* we
                    // haven't already done so.  This avoids lots of reloading when we remove/add a sharded collection
                    //

                    bool alreadyReloaded = lastNeededVersion &&
                                           lastNeededVersion->isEquivalentTo( needVersion );

                    if( alreadyReloaded ){

                        LOG(1) << "wbl already reloaded config information for version "
                               << needVersion << ", at version " << currVersion << endl;
                    }
                    else if( lastNeededVersion ) {

                        log() << "new version change detected to " << needVersion.toString()
                              << ", " << lastNeededCount << " writebacks processed at "
                              << lastNeededVersion->toString() << endl;

                        lastNeededCount = 0;
                    }

                    //
                    // Set our lastNeededVersion for next time
                    //

                    lastNeededVersion.reset( new ChunkVersion( needVersion ) );
                    lastNeededCount++;

                    //
                    // Determine if we should reload, if so, reload
                    //

                    bool shouldReload = ! needVersion.isWriteCompatibleWith( currVersion ) &&
                                        ! alreadyReloaded;

                    if( shouldReload && currVersion.isSet()
                                     && needVersion.isSet()
                                     && currVersion.hasCompatibleEpoch( needVersion ) )
                    {

                        //
                        // If we disagree about versions only, reload the chunk manager
                        //

                        db->getChunkManagerIfExists( ns, true );
                    }
                    else if( shouldReload ){

                        //
                        // If we disagree about anything else, reload the full db
                        //

                        warning() << "reloading config data for " << db->getName() << ", "
                                  << "wanted version " << needVersion.toString()
                                  << " but currently have version " << currVersion.toString() << endl;

                        db->reload();
                    }

                    // do request and then call getLastError
                    // we have to call getLastError so we can return the right fields to the user if they decide to call getLastError

                    BSONObj gle;
                    int attempts = 0;
                    while ( true ) {
                        attempts++;

                        try {

                            Request r( msg , 0 );
                            r.init();

                            r.d().reservedField() |= Reserved_FromWriteback;

                            ClientInfo * ci = r.getClientInfo();
                            if (AuthorizationManager::isAuthEnabled()) {
                                ci->getAuthorizationSession()->grantInternalAuthorization(
                                        UserName("_writebackListener", "local"));
                            }
                            ci->noAutoSplit();

                            r.process( attempts );

                            ci->newRequest(); // this so we flip prev and cur shards

                            BSONObjBuilder b;
                            string errmsg;
                            if ( ! ci->getLastError( "admin",
                                                     BSON( "getLastError" << 1 ),
                                                     b,
                                                     errmsg,
                                                     true ) )
                            {
                                b.appendBool( "commandFailed" , true );
                                if( ! b.hasField( "errmsg" ) ){

                                    b.append( "errmsg", errmsg );
                                    gle = b.obj();
                                }
                                else if( errmsg.size() > 0 ){

                                    // Rebuild GLE object with errmsg
                                    // TODO: Make this less clumsy by improving GLE interface
                                    gle = b.obj();

                                    if( gle["errmsg"].type() == String ){

                                        BSONObj gleNoErrmsg =
                                                gle.filterFieldsUndotted( BSON( "errmsg" << 1 ),
                                                                          false );
                                        BSONObjBuilder bb;
                                        bb.appendElements( gleNoErrmsg );
                                        bb.append( "errmsg", gle["errmsg"].String() +
                                                             " ::and:: " +
                                                             errmsg );
                                        gle = bb.obj().getOwned();
                                    }
                                }
                            }
                            else{
                                gle = b.obj();
                            }

                            if ( gle["code"].numberInt() == 9517 ) {

                                log() << "new version change detected, "
                                      << lastNeededCount << " writebacks processed previously" << endl;

                                lastNeededVersion.reset();
                                lastNeededCount = 1;

                                log() << "writeback failed because of stale config, retrying attempts: " << attempts << endl;
                                LOG(1) << "writeback error : " << gle << endl;

                                //
                                // Bringing this in line with the similar retry logic elsewhere
                                //
                                // TODO: Reloading the chunk manager may not help if we dropped a
                                // collection, but we don't actually have that info in the writeback
                                // error
                                //

                                if( attempts <= 2 ){
                                    db->getChunkManagerIfExists( ns, true );
                                }
                                else{
                                    versionManager.forceRemoteCheckShardVersionCB( ns );
                                    sleepsecs( attempts - 1 );
                                }

                                uassert( 15884, str::stream()
                                         << "Could not reload chunk manager after "
                                         << attempts << " attempts.", attempts <= 4 );

                                continue;
                            }

                            ci->clearSinceLastGetError();
                        }
                        catch ( DBException& e ) {
                            error() << "error processing writeback: " << e << endl;
                            BSONObjBuilder b;
                            e.getInfo().append( b, "err", "code" );
                            gle = b.obj();
                        }

                        break;
                    }

                    {
                        scoped_lock lk( _seenWritebacksLock );
                        WBStatus& s = _seenWritebacks[cid];
                        s.id = wid;
                        s.gle = gle;
                    }
                }
                else if ( result["noop"].trueValue() ) {
                    // no-op
                }
                else {
                    log() << "unknown writeBack result: " << result << endl;
                }

                secsToSleep = 0;
                continue;
            }
            catch ( std::exception& e ) {
                // Attention! Do not call any method that would throw an exception
                // (or assert) in this block.

                if ( inShutdown() ) {
                    // we're shutting down, so just clean up
                    return;
                }

                log() << "WriteBackListener exception : " << e.what() << endl;

                needsToReloadShardInfo = true;
            }
            catch ( ... ) {
                log() << "WriteBackListener uncaught exception!" << endl;
            }
            secsToSleep++;
            sleepsecs(secsToSleep);
            if ( secsToSleep > 10 )
                secsToSleep = 0;
        }

        log() << "WriteBackListener exiting : address no longer in cluster " << _addr;

    }
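The tail of the loop above is a crude backoff: each failed pass sleeps one second longer than the last, the counter wraps to zero once it passes ten, and a successful pass resets it. The same logic in isolation, with milliseconds substituted for seconds so the demo finishes quickly:

#include <chrono>
#include <iostream>
#include <thread>

int secsToSleep = 0;

void onFailedPass() {
    ++secsToSleep;
    // The original calls sleepsecs(); milliseconds keep the demo fast.
    std::this_thread::sleep_for(std::chrono::milliseconds(secsToSleep));
    if (secsToSleep > 10)
        secsToSleep = 0;
}

void onSuccessfulPass() { secsToSleep = 0; }

int main() {
    for (int i = 0; i < 12; ++i) {
        onFailedPass();
        std::cout << "counter now " << secsToSleep << "\n";  // shows the wrap
    }
}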
Example #29
    void Balancer::run() {

        // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely
        while ( ! inShutdown() ) {

            if ( ! _init() ) {
                log() << "will retry to initialize balancer in one minute" << endl;
                sleepsecs( 60 );
                continue;
            }

            break;
        }

        int sleepTime = 30;

        // getConnectionString and the dist lock constructor do not throw,
        // which is what we expect while on the balancer thread
        ConnectionString config = configServer.getConnectionString();
        DistributedLock balanceLock( config , "balancer" );

        while ( ! inShutdown() ) {

            try {

                ScopedDbConnection conn(config.toString(), 30);

                // ping has to be first so we keep things in the config server in sync
                _ping( conn.conn() );

                // use fresh shard state
                Shard::reloadShardInfo();

                // refresh chunk size (even though another balancer might be active)
                Chunk::refreshChunkSize();

                BSONObj balancerConfig;
                // now make sure we should even be running
                if ( ! grid.shouldBalance( "", &balancerConfig ) ) {
                    LOG(1) << "skipping balancing round because balancing is disabled" << endl;

                    // Ping again so scripts can determine if we're active without waiting
                    _ping( conn.conn(), true );

                    conn.done();

                    sleepsecs( sleepTime );
                    continue;
                }

                sleepTime = balancerConfig[SettingsType::shortBalancerSleep()].trueValue() ? 30 : 6;
                
                uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );

                {
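                    // RAII try-lock: got() is false when another balancer holds the lock;
                    // if acquired, the lock is released when lk goes out of scope below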
                    dist_lock_try lk( &balanceLock , "doing balance round" );
                    if ( ! lk.got() ) {
                        LOG(1) << "skipping balancing round because another balancer is active" << endl;

                        // Ping again so scripts can determine if we're active without waiting
                        _ping( conn.conn(), true );

                        conn.done();
                        
                        sleepsecs( sleepTime ); // no need to wake up soon
                        continue;
                    }
                    
                    LOG(1) << "*** start balancing round" << endl;

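                    // per-round options from the balancer settings document:
                    // _waitForDelete makes each migration wait until the source shard has
                    // deleted the moved range; secondaryThrottle (on by default) makes the
                    // migration wait for replication to secondaries as it copies documents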
                    bool waitForDelete = balancerConfig["_waitForDelete"].trueValue();

                    bool secondaryThrottle = true; // default to on
                    if ( balancerConfig[SettingsType::secondaryThrottle()].type() ) {
                        secondaryThrottle = balancerConfig[SettingsType::secondaryThrottle()].trueValue();
                    }

                    LOG(1) << "waitForDelete: " << waitForDelete << endl;
                    LOG(1) << "secondaryThrottle: " << secondaryThrottle << endl;

                    vector<CandidateChunkPtr> candidateChunks;
                    _doBalanceRound( conn.conn() , &candidateChunks );
                    if ( candidateChunks.size() == 0 ) {
                        LOG(1) << "no need to move any chunk" << endl;
                        _balancedLastTime = 0;
                    }
                    else {
                        _balancedLastTime = _moveChunks(&candidateChunks,
                                                        secondaryThrottle,
                                                        waitForDelete );
                    }

                    LOG(1) << "*** end of balancing round" << endl;
                }

                // Ping again so scripts can determine if we're active without waiting
                _ping( conn.conn(), true );
                
                conn.done();

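                // if chunks moved this round there is probably more work to do,
                // so wake up again much sooner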
                sleepsecs( _balancedLastTime ? sleepTime / 6 : sleepTime );
            }
            catch ( std::exception& e ) {
                log() << "caught exception while doing balance: " << e.what() << endl;

                // Just to match the opening statement if in log level 1
                LOG(1) << "*** End of balancing round" << endl;

                sleepsecs( sleepTime ); // sleep a fair amount b/c of error
                continue;
            }
        }

    }
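The round above runs only while dist_lock_try holds the cluster-wide "balancer" lock, and the guard releases the lock when control leaves the block at the end of the round. As a rough process-local analogy, assuming std::mutex as a stand-in for the distributed lock (TryLockGuard is an illustrative name, not a MongoDB class):

#include <iostream>
#include <mutex>

// RAII try-lock guard: attempts the lock without blocking, releases it at scope exit
class TryLockGuard {
public:
    explicit TryLockGuard(std::mutex& m) : _m(m), _got(m.try_lock()) {}
    ~TryLockGuard() {
        if (_got)
            _m.unlock();
    }
    bool got() const { return _got; }
private:
    std::mutex& _m;
    bool _got;
};

int main() {
    std::mutex balanceLock;   // stands in for DistributedLock( config, "balancer" )
    {
        TryLockGuard lk(balanceLock);
        if (!lk.got()) {
            std::cout << "skipping balancing round because another balancer is active\n";
            return 0;
        }
        std::cout << "*** start balancing round\n";
    }   // lock released here, as dist_lock_try does at the end of its block
    return 0;
}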
Beispiel #30
0
    void WriteBackListener::run(){
        OID lastID;
        lastID.clear();
        int secsToSleep = 0;
        while ( ! inShutdown() && Shard::isMember( _addr ) ){
                
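            // before blocking for the next writeback, record the id of the one just
            // processed so duplicate deliveries can be detected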
            if ( lastID.isSet() ){
                scoped_lock lk( _seenWritebacksLock );
                _seenWritebacks.insert( lastID );
                lastID.clear();
            }

            try {
                ScopedDbConnection conn( _addr );
                    
                BSONObj result;
                    
                {
                    BSONObjBuilder cmd;
                    cmd.appendOID( "writebacklisten" , &serverID ); // Command will block for data
                    if ( ! conn->runCommand( "admin" , cmd.obj() , result ) ){
                        log() <<  "writebacklisten command failed!  "  << result << endl;
                        conn.done();
                        continue;
                    }

                }
                    
                log(1) << "writebacklisten result: " << result << endl;
                    
                BSONObj data = result.getObjectField( "data" );
                if ( data.getBoolField( "writeBack" ) ){
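                    // a mongod queued this write because it arrived with a stale shard
                    // version; unpack the original message and replay it from here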
                    string ns = data["ns"].valuestrsafe();
                    {
                        BSONElement e = data["id"];
                        if ( e.type() == jstOID )
                            lastID = e.OID();
                    }
                    int len;

                    Message m( (void*)data["msg"].binData( len ) , false );
                    massert( 10427 ,  "invalid writeback message" , m.header()->valid() );                        

                    DBConfigPtr db = grid.getDBConfig( ns );
                    ShardChunkVersion needVersion( data["version"] );
                        
                    log(1) << "writeback id: " << lastID << " needVersion : " << needVersion.toString() 
                           << " mine : " << db->getChunkManager( ns )->getVersion().toString() << endl;// TODO change to log(3)
                        
                    if ( logLevel ) log(1) << debugString( m ) << endl;

                    if ( needVersion.isSet() && needVersion <= db->getChunkManager( ns )->getVersion() ){
                        // the write was originally sent with an old version; our current
                        // chunk manager already covers it, so the config has been updated
                        // and there is no need to reload
                        //db->getChunkManager( ns , true ); // SERVER-1349
                    }
                    else {
                        // we received a writeback object that was sent to a previous version of a shard
                        // the actual shard may not have the object the writeback operation is for 
                        // we need to reload the chunk manager and get the new shard versions
                        db->getChunkManager( ns , true );
                    }
                        
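                    // replay the original message through the normal mongos request path,
                    // as if the client had just sent it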
                    Request r( m , 0 );
                    r.init();
                    r.process();
                }
                else if ( result["noop"].trueValue() ){
                    // no-op
                }
                else {
                    log() << "unknown writeBack result: " << result << endl;
                }
                    
                conn.done();
                secsToSleep = 0;
                continue;
            }
            catch ( std::exception& e ){

                if ( inShutdown() ){
                    // we're shutting down, so just clean up
                    return;
                }

                log() << "WriteBackListener exception : " << e.what() << endl;
                    
                // It's possible this shard was removed
                Shard::reloadShardInfo();                    
            }
            catch ( ... ){
                log() << "WriteBackListener uncaught exception!" << endl;
            }
            secsToSleep++;
            sleepsecs(secsToSleep);
            if ( secsToSleep > 10 )
                secsToSleep = 0;
        }

        log() << "WriteBackListener exiting : address no longer in cluster " << _addr;

    }